From 3b2c56b55f72fe9635c170ef7214c5f3c452fa46 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Tue, 25 Jun 2024 13:54:02 +0200 Subject: [PATCH 01/61] Starting to fix GroundingDinoLoss and GroundingDinoHungarianMatcher --- .../grounding_dino/modeling_grounding_dino.py | 71 ++++++++----------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index dcdccc50cc116d..c6266391aac09f 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2616,46 +2616,6 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f return loss.mean(1).sum() / num_boxes -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor(object): - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino class GroundingDinoHungarianMatcher(nn.Module): """ @@ -2851,6 +2811,34 @@ def _get_target_permutation_idx(self, indices): batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) target_idx = torch.cat([target for (_, target) in indices]) return batch_idx, target_idx + + # Ignore copy + def _get_label_map(self, outputs): + """ + Computes a mapping between the tokens associated with the prompt labels in the logit space with shape (batch_size, num_labels, hidden_size) + where `num_labels` is defined by the number of classes in the input prompt. + + For instance if the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102] + this function will then return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") + indicating their position in the logit space. + + This is used in `loss_labels` and in the `GroundingDinoHungarianMatcher`.) 
+ """ + input_ids = outputs["input_ids"] # (batch_size, num_tokens) + # Add [PAD] token to the list of special tokens + delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) + + # NOTE: Loop for now, but then trying to do in a bachtwise manner + # things to remember for batchwise later on: + # Easy to get the delimiter indices (only the valid ones i.e. diff between two consecutive delimiters is > 1) + # Have to update the class_labels in the targets with previous amount of labels as the number of labes in prompt might be different. + # Have to update the delimiter_indices with seq_len. + for ids in input_ids: + delimiter_token_mask = torch.isin(ids, delimiter_tokens) + delimiter_indices = delimiter_token_mask.nonzero() + + # Placeholder for the label map + label_map = torch.zeros_like(...) def get_loss(self, loss, outputs, targets, indices, num_boxes): loss_map = { @@ -2862,6 +2850,7 @@ def get_loss(self, loss, outputs, targets, indices, num_boxes): raise ValueError(f"Loss {loss} not supported") return loss_map[loss](outputs, targets, indices, num_boxes) + # Ignore copy def forward(self, outputs, targets): """ This performs the loss computation. @@ -2875,6 +2864,8 @@ def forward(self, outputs, targets): """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} + label_map = self._get_label_map(outputs) + # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) From 3b328c01fe04d94687207196e3796b186376375d Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 30 Jun 2024 12:06:45 +0200 Subject: [PATCH 02/61] More updates --- .../grounding_dino/modeling_grounding_dino.py | 125 +++++++++++++----- 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index c6266391aac09f..173e6096c83462 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2645,6 +2645,7 @@ def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float raise ValueError("All costs of the Matcher can't be 0") @torch.no_grad() + # Ignore copy def forward(self, outputs, targets): """ Args: @@ -2652,6 +2653,7 @@ def forward(self, outputs, targets): A dictionary that contains at least these entries: * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + * "label_maps": Tuple of tensors of dim [num_classes, hidden_dim]. 
targets (`List[dict]`): A list of targets (len(targets) = batch_size), where each target is a dict containing: * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of @@ -2668,11 +2670,16 @@ def forward(self, outputs, targets): batch_size, num_queries = outputs["logits"].shape[:2] # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, hidden_dim] out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + label_maps = outputs["label_maps"] + + # First take the label map for each class in each batch and then concatenate them + label_maps = torch.cat([label_map[v["class_labels"]] for v in targets for label_map in label_maps]) + # Normalize label maps based on number of tokens per class + label_maps = label_maps / label_maps.sum(dim=-1, keepdim=True) # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) target_bbox = torch.cat([v["boxes"] for v in targets]) # Compute the classification cost. @@ -2680,7 +2687,8 @@ def forward(self, outputs, targets): gamma = 2.0 neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + # Compute the classification cost by taking pos and neg cost in the appropriate index + class_cost = (pos_cost_class - neg_cost_class) @ label_maps.t() # Compute the L1 cost between boxes bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) @@ -2722,7 +2730,7 @@ def __init__(self, matcher, num_classes, focal_alpha, losses): self.focal_alpha = focal_alpha self.losses = losses - # removed logging parameter, which was part of the original implementation + # Ignore copy def loss_labels(self, outputs, targets, indices, num_boxes): """ Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor @@ -2730,29 +2738,53 @@ def loss_labels(self, outputs, targets, indices, num_boxes): """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") + if "one_hot_labels" not in outputs: + raise KeyError("No one_hot_labels were found in the outputs") + if "text_mask" not in outputs: + raise KeyError("No text_mask were found in the outputs") + source_logits = outputs["logits"] + # TODO maybe create one_hot and text_mask here (pass attention mask to outputs) + target_classes_onehot = outputs["one_hot"] + text_mask = outputs["text_mask"] - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o + ### New implementation + batch_size, num_queries, hidden_dim = source_logits.shape - target_classes_onehot = torch.zeros( - [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - dtype=source_logits.dtype, - layout=source_logits.layout, - device=source_logits.device, - ) - target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + if text_mask is not None: + text_mask = text_mask.repeat(1, num_queries) + text_mask = text_mask.view(batch_size, -1, hidden_dim) + source_logits = 
torch.masked_select(source_logits, text_mask) + target_classes_onehot = torch.masked_select(target_classes_onehot, text_mask) + + target_classes_onehot = target_classes_onehot.float() + loss_ce = sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - target_classes_onehot = target_classes_onehot[:, :, :-1] - loss_ce = ( - sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - * source_logits.shape[1] - ) losses = {"loss_ce": loss_ce} + # return losses + + ### Old implementation + # idx = self._get_source_permutation_idx(indices) + # target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + # target_classes = torch.full( + # source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + # ) + # target_classes[idx] = target_classes_o + + # target_classes_onehot = torch.zeros( + # [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + # dtype=source_logits.dtype, + # layout=source_logits.layout, + # device=source_logits.device, + # ) + # target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + # target_classes_onehot = target_classes_onehot[:, :, :-1] + # loss_ce = ( + # sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + # * source_logits.shape[1] + # ) + # losses = {"loss_ce": loss_ce} return losses @@ -2773,7 +2805,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): losses = {"cardinality_error": card_err} return losses - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes + # Ignore copy def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. @@ -2796,6 +2828,12 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) ) losses["loss_giou"] = loss_giou.sum() / num_boxes + + # calculate the x,y and h,w loss + with torch.no_grad(): + losses["loss_xy"] = loss_bbox[..., :2].sum() / num_boxes + losses["loss_hw"] = loss_bbox[..., 2:].sum() / num_boxes + return losses # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx @@ -2811,9 +2849,9 @@ def _get_target_permutation_idx(self, indices): batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) target_idx = torch.cat([target for (_, target) in indices]) return batch_idx, target_idx - + # Ignore copy - def _get_label_map(self, outputs): + def _get_label_maps(self, outputs): """ Computes a mapping between the tokens associated with the prompt labels in the logit space with shape (batch_size, num_labels, hidden_size) where `num_labels` is defined by the number of classes in the input prompt. @@ -2824,7 +2862,8 @@ def _get_label_map(self, outputs): This is used in `loss_labels` and in the `GroundingDinoHungarianMatcher`.) """ - input_ids = outputs["input_ids"] # (batch_size, num_tokens) + batch_size, num_boxes, hidden_size = outputs["logits"].shape + input_ids = outputs["input_ids"] # (batch_size, num_tokens) # Add [PAD] token to the list of special tokens delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) @@ -2833,12 +2872,22 @@ def _get_label_map(self, outputs): # Easy to get the delimiter indices (only the valid ones i.e. 
diff between two consecutive delimiters is > 1) # Have to update the class_labels in the targets with previous amount of labels as the number of labes in prompt might be different. # Have to update the delimiter_indices with seq_len. - for ids in input_ids: - delimiter_token_mask = torch.isin(ids, delimiter_tokens) - delimiter_indices = delimiter_token_mask.nonzero() - - # Placeholder for the label map - label_map = torch.zeros_like(...) + delimiter_token_masks = torch.isin(input_ids, delimiter_tokens) + label_maps = () + for delimiter_token_mask in delimiter_token_masks: + label_map_within_batch = [] + delimiter_indices = torch.where(delimiter_token_mask)[0] + for i in range(len(delimiter_indices) - 1): + start = delimiter_indices[i] + end = delimiter_indices[i + 1] + if end - start > 1: + label_map = torch.zeros(hidden_size, device=input_ids.device) + label_map[start + 1 : end] = 1 + label_map_within_batch.append(label_map) + + label_maps += (torch.stack(label_map_within_batch),) + + return label_maps def get_loss(self, loss, outputs, targets, indices, num_boxes): loss_map = { @@ -2864,11 +2913,21 @@ def forward(self, outputs, targets): """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} - label_map = self._get_label_map(outputs) + outputs_without_aux["label_maps"] = self._get_label_maps(outputs) # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) + # Create one_hot based on the matching indices + one_hot = torch.zeros_like( + outputs["logits"], device=outputs["logits"].device, dtype=torch.long + ) # (batch_size, num_queries, hidden_dim) + class_labels = [target["class_labels"] for target in targets] + for i, (source, target) in enumerate(indices): + labels = class_labels[i][target] + one_hot[i, source] = outputs_without_aux["label_maps"][i][labels].to(torch.long) + outputs_without_aux["one_hot"] = one_hot + # Compute the average number of target boxes accross all nodes, for normalization purposes num_boxes = sum(len(t["class_labels"]) for t in targets) num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) From 3b84fa742e329a79f5289b41df08b38e78e724f6 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Wed, 3 Jul 2024 09:12:32 +0200 Subject: [PATCH 03/61] More updates --- .../grounding_dino/modeling_grounding_dino.py | 2 ++ .../test_modeling_grounding_dino.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 173e6096c83462..dcd1308e16c6e2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -3144,6 +3144,8 @@ def forward( outputs_loss = {} outputs_loss["logits"] = logits outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["input_ids"] = input_ids + outputs_loss["attention_mask"] = attention_mask if self.config.auxiliary_loss: auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) outputs_loss["auxiliary_outputs"] = auxiliary_outputs diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 1b4970785aa6c6..7d47010c8ea0db 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ 
b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -616,6 +616,9 @@ def prepare_text(): return text +def prepare_for_loss(): ... + + @require_timm @require_vision @slow @@ -741,3 +744,18 @@ def test_cross_attention_mask(self): self.assertTrue(torch.allclose(outputs1.logits, outputs_batched.logits[:1], atol=1e-3)) # For some reason 12 elements are > 1e-3, but the rest are fine self.assertTrue(torch.allclose(outputs2.logits, outputs_batched.logits[1:], atol=1.8e-3)) + + def test_grounding_dino_loss(self): + model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(torch_device) + + processor = self.default_processor + image, text, labels = prepare_for_loss() + encoding = processor(images=image, text=text, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(labels=labels, **encoding) + + # test loss + loss = outputs.loss + expected_loss = ... + self.assertEqual(loss.item(), expected_loss, msg="Loss is not matching expected value") From ce59ba7b8d7e70995156d4a46566100133cc4a29 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sat, 13 Jul 2024 20:07:13 +0200 Subject: [PATCH 04/61] fixed: GroundingDinoLoss --- .../configuration_grounding_dino.py | 8 + .../grounding_dino/modeling_grounding_dino.py | 340 +++++++++--------- .../test_modeling_grounding_dino.py | 76 +++- 3 files changed, 243 insertions(+), 181 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 362e50a1c1cc68..8d7316718c91bd 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -97,10 +97,14 @@ class GroundingDinoConfig(PretrainedConfig): Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. giou_cost (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + class_loss_coefficient (`float`, *optional*, defaults to 2.0): + Relative weight of the cross-entropy loss in the object detection loss. bbox_loss_coefficient (`float`, *optional*, defaults to 5.0): Relative weight of the L1 bounding box loss in the object detection loss. giou_loss_coefficient (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss in the object detection loss. + class_loss_reduction (`str`, *optional*, defaults to `"mean"`): + The reduction method for the classification loss. One of `"mean"` or `"sum"`. focal_alpha (`float`, *optional*, defaults to 0.25): Alpha parameter in the focal loss. 
disable_custom_kernels (`bool`, *optional*, defaults to `False`): @@ -181,8 +185,10 @@ def __init__( class_cost=1.0, bbox_cost=5.0, giou_cost=2.0, + class_loss_coefficient=2.0, bbox_loss_coefficient=5.0, giou_loss_coefficient=2.0, + class_loss_reduction="mean", focal_alpha=0.25, disable_custom_kernels=False, # other parameters @@ -255,8 +261,10 @@ def __init__( self.bbox_cost = bbox_cost self.giou_cost = giou_cost # Loss coefficients + self.class_loss_coefficient = class_loss_coefficient self.bbox_loss_coefficient = bbox_loss_coefficient self.giou_loss_coefficient = giou_loss_coefficient + self.class_loss_reduction = class_loss_reduction self.focal_alpha = focal_alpha self.disable_custom_kernels = disable_custom_kernels # Text backbone diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index dcd1308e16c6e2..466aaca16148a1 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch Grounding DINO model.""" -import copy import math import os import warnings @@ -259,12 +258,18 @@ class GroundingDinoModelOutput(ModelOutput): weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the bi-attention heads. + enc_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): + Top `config.num_queries` scoring bounding boxes indicies picked as region proposals in the first stage. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. + encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Logits of top `config.num_queries` scoring bounding boxes in the first stage. + encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Coordinates of top `config.num_queries` scoring bounding boxes in the first stage. 
""" last_hidden_state: torch.FloatTensor = None @@ -278,8 +283,11 @@ class GroundingDinoModelOutput(ModelOutput): encoder_vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + enc_topk_proposals: Optional[torch.FloatTensor] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + encoder_logits: Optional[torch.FloatTensor] = None + encoder_pred_boxes: Optional[torch.FloatTensor] = None @dataclass @@ -338,12 +346,18 @@ class GroundingDinoObjectDetectionOutput(ModelOutput): Stacked intermediate reference points (reference points of each layer of the decoder). init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. + enc_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): + Top `config.num_queries` scoring bounding boxes indicies picked as region proposals in the first stage. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and background). enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): Logits of predicted bounding boxes coordinates in the first stage. + encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Logits of top `config.num_queries` scoring bounding boxes in the first stage. + encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Coordinates of top `config.num_queries` scoring bounding boxes in the first stage. 
""" loss: Optional[torch.FloatTensor] = None @@ -362,8 +376,11 @@ class GroundingDinoObjectDetectionOutput(ModelOutput): encoder_vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + enc_topk_proposals: Optional[torch.FloatTensor] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + encoder_logits: Optional[torch.FloatTensor] = None + encoder_pred_boxes: Optional[torch.FloatTensor] = None # Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino @@ -2384,8 +2401,11 @@ def forward( ) # Fifth, prepare decoder inputs + topk_proposals = None enc_outputs_class = None enc_outputs_coord_logits = None + encoder_logits = None + encoder_pred_boxes = None if self.config.two_stage: object_query_embedding, output_proposals = self.generate_encoder_output_proposals( encoder_outputs[0], ~mask_flatten, spatial_shapes @@ -2418,6 +2438,10 @@ def forward( target = torch.gather( object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) ).detach() + + # Set intermediate topk proposals (coordsa nd class) for loss computation + encoder_pred_boxes = topk_coords_logits.sigmoid() + encoder_logits = self.encoder_output_class_embed(target, text_features, text_token_mask) else: target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() @@ -2440,7 +2464,9 @@ def forward( ) if not return_dict: - enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + enc_outputs = tuple( + value for value in [topk_proposals, enc_outputs_class, enc_outputs_coord_logits] if value is not None + ) tuple_outputs = ( (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs ) @@ -2459,8 +2485,11 @@ def forward( encoder_vision_hidden_states=encoder_outputs.vision_hidden_states, encoder_text_hidden_states=encoder_outputs.text_hidden_states, encoder_attentions=encoder_outputs.attentions, + enc_topk_proposals=topk_proposals, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, + encoder_logits=encoder_logits, + encoder_pred_boxes=encoder_pred_boxes, ) @@ -2554,38 +2583,19 @@ def generalized_box_iou(boxes1, boxes2): return iou - (area - union) / area -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). 
- """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): +# Changed the way we reduce the loss since pass only valid logits to avoid inf/nan problems +# also added num_queries and reduction args as the base implementation uses 'sum' reduction +# but the sigmoid_focal_loss in the original implementation uses 'mean' reduction and the 'sum' +# reduction leads to a loss_ce that is considerably higher than the other losses. +def sigmoid_focal_loss( + inputs: torch.Tensor, + targets: torch.Tensor, + num_boxes: int, + num_queries: int, + reduction: str = "mean", + alpha: float = 0.25, + gamma: float = 2, +): """ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. @@ -2595,6 +2605,12 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f targets (`torch.FloatTensor` with the same shape as `inputs`) A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class and 1 for the positive class). + num_boxes (`int`): + The total number of boxes in the batch. + num_queries (`int`): + The number of query boxes per image. + reduction (`str`, *optional*, defaults to `'mean'`): + Specifies the redction to apply to the loss. Can be either `'mean'`, or `'sum'`. alpha (`float`, *optional*, defaults to `0.25`): Optional weighting factor in the range (0,1) to balance positive vs. negative examples. gamma (`int`, *optional*, defaults to `2`): @@ -2613,7 +2629,9 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss - return loss.mean(1).sum() / num_boxes + if reduction == "mean": + return loss.sum() / num_queries / num_boxes + return loss.sum() / num_boxes # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino @@ -2675,7 +2693,7 @@ def forward(self, outputs, targets): label_maps = outputs["label_maps"] # First take the label map for each class in each batch and then concatenate them - label_maps = torch.cat([label_map[v["class_labels"]] for v in targets for label_map in label_maps]) + label_maps = torch.cat([label_map[target["class_labels"]] for label_map, target in zip(label_maps, targets)]) # Normalize label maps based on number of tokens per class label_maps = label_maps / label_maps.sum(dim=-1, keepdim=True) @@ -2705,7 +2723,6 @@ def forward(self, outputs, targets): return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino class GroundingDinoLoss(nn.Module): """ This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we @@ -2715,22 +2732,38 @@ class GroundingDinoLoss(nn.Module): Args: matcher (`GroundingDinoHungarianMatcher`): Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. focal_alpha (`float`): Alpha parameter in focal loss. 
losses (`List[str]`): List of all the losses to be applied. See `get_loss` for a list of all available losses. + class_reduction (`str`): + Specifies the reduction to apply to the label loss. Can be either `'mean'` or `'sum'` """ - def __init__(self, matcher, num_classes, focal_alpha, losses): + def __init__(self, matcher, focal_alpha, losses, class_reduction): super().__init__() self.matcher = matcher - self.num_classes = num_classes self.focal_alpha = focal_alpha self.losses = losses + self.class_reduction = class_reduction - # Ignore copy + def _get_target_classes_one_hot(self, outputs, targets, indices): + """ + Create one_hot based on the matching indices + """ + class_labels = [target["class_labels"] for target in targets] + logits = outputs["logits"] + label_maps = outputs["label_maps"] + + target_classes_onehot = torch.zeros_like(logits, device=logits.device, dtype=torch.long) + + for i, (source, target) in enumerate(indices): + labels = class_labels[i][target] + target_classes_onehot[i, source] = label_maps[i][labels].to(torch.long) + + return target_classes_onehot + + # Added new target_classes_onehot and step to get valid logits and new sigmoid_focal_loss signature def loss_labels(self, outputs, targets, indices, num_boxes): """ Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor @@ -2738,74 +2771,35 @@ def loss_labels(self, outputs, targets, indices, num_boxes): """ if "logits" not in outputs: raise KeyError("No logits were found in the outputs") - if "one_hot_labels" not in outputs: - raise KeyError("No one_hot_labels were found in the outputs") if "text_mask" not in outputs: raise KeyError("No text_mask were found in the outputs") + target_classes_onehot = self._get_target_classes_one_hot(outputs, targets, indices) source_logits = outputs["logits"] - # TODO maybe create one_hot and text_mask here (pass attention mask to outputs) - target_classes_onehot = outputs["one_hot"] text_mask = outputs["text_mask"] - ### New implementation - batch_size, num_queries, hidden_dim = source_logits.shape + # Select only valid logits + source_logits = torch.masked_select(source_logits, text_mask) + target_classes_onehot = torch.masked_select(target_classes_onehot, text_mask) - if text_mask is not None: - text_mask = text_mask.repeat(1, num_queries) - text_mask = text_mask.view(batch_size, -1, hidden_dim) - source_logits = torch.masked_select(source_logits, text_mask) - target_classes_onehot = torch.masked_select(target_classes_onehot, text_mask) + num_queries = source_logits.shape[0] target_classes_onehot = target_classes_onehot.float() - loss_ce = sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + loss_ce = sigmoid_focal_loss( + inputs=source_logits, + targets=target_classes_onehot, + num_boxes=num_boxes, + num_queries=num_queries, + reduction=self.class_reduction, + alpha=self.focal_alpha, + gamma=2, + ) losses = {"loss_ce": loss_ce} - # return losses - - ### Old implementation - # idx = self._get_source_permutation_idx(indices) - # target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - # target_classes = torch.full( - # source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - # ) - # target_classes[idx] = target_classes_o - - # target_classes_onehot = torch.zeros( - # [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - # dtype=source_logits.dtype, - # layout=source_logits.layout, - 
# device=source_logits.device, - # ) - # target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - - # target_classes_onehot = target_classes_onehot[:, :, :-1] - # loss_ce = ( - # sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - # * source_logits.shape[1] - # ) - # losses = {"loss_ce": loss_ce} return losses - @torch.no_grad() - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - # Ignore copy + # Added loss_xy and loss_hw to calculate the x,y and h,w loss def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. @@ -2836,70 +2830,30 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): return losses - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx def _get_source_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) source_idx = torch.cat([source for (source, _) in indices]) return batch_idx, source_idx - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx def _get_target_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) target_idx = torch.cat([target for (_, target) in indices]) return batch_idx, target_idx - # Ignore copy - def _get_label_maps(self, outputs): - """ - Computes a mapping between the tokens associated with the prompt labels in the logit space with shape (batch_size, num_labels, hidden_size) - where `num_labels` is defined by the number of classes in the input prompt. - - For instance if the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102] - this function will then return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") - indicating their position in the logit space. - - This is used in `loss_labels` and in the `GroundingDinoHungarianMatcher`.) 
- """ - batch_size, num_boxes, hidden_size = outputs["logits"].shape - input_ids = outputs["input_ids"] # (batch_size, num_tokens) - # Add [PAD] token to the list of special tokens - delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) - - # NOTE: Loop for now, but then trying to do in a bachtwise manner - # things to remember for batchwise later on: - # Easy to get the delimiter indices (only the valid ones i.e. diff between two consecutive delimiters is > 1) - # Have to update the class_labels in the targets with previous amount of labels as the number of labes in prompt might be different. - # Have to update the delimiter_indices with seq_len. - delimiter_token_masks = torch.isin(input_ids, delimiter_tokens) - label_maps = () - for delimiter_token_mask in delimiter_token_masks: - label_map_within_batch = [] - delimiter_indices = torch.where(delimiter_token_mask)[0] - for i in range(len(delimiter_indices) - 1): - start = delimiter_indices[i] - end = delimiter_indices[i + 1] - if end - start > 1: - label_map = torch.zeros(hidden_size, device=input_ids.device) - label_map[start + 1 : end] = 1 - label_map_within_batch.append(label_map) - - label_maps += (torch.stack(label_map_within_batch),) - - return label_maps - + # Removed cardinality loss as it is not used. def get_loss(self, loss, outputs, targets, indices, num_boxes): loss_map = { "labels": self.loss_labels, - "cardinality": self.loss_cardinality, "boxes": self.loss_boxes, } if loss not in loss_map: raise ValueError(f"Loss {loss} not supported") return loss_map[loss](outputs, targets, indices, num_boxes) - # Ignore copy def forward(self, outputs, targets): """ This performs the loss computation. @@ -2913,21 +2867,9 @@ def forward(self, outputs, targets): """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} - outputs_without_aux["label_maps"] = self._get_label_maps(outputs) - # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) - # Create one_hot based on the matching indices - one_hot = torch.zeros_like( - outputs["logits"], device=outputs["logits"].device, dtype=torch.long - ) # (batch_size, num_queries, hidden_dim) - class_labels = [target["class_labels"] for target in targets] - for i, (source, target) in enumerate(indices): - labels = class_labels[i][target] - one_hot[i, source] = outputs_without_aux["label_maps"][i][labels].to(torch.long) - outputs_without_aux["one_hot"] = one_hot - # Compute the average number of target boxes accross all nodes, for normalization purposes num_boxes = sum(len(t["class_labels"]) for t in targets) num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) @@ -2954,18 +2896,59 @@ def forward(self, outputs, targets): if "enc_outputs" in outputs: enc_outputs = outputs["enc_outputs"] - bin_targets = copy.deepcopy(targets) - for bt in bin_targets: - bt["class_labels"] = torch.zeros_like(bt["class_labels"]) - indices = self.matcher(enc_outputs, bin_targets) + indices = self.matcher(enc_outputs, targets) for loss in self.losses: - l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = self.get_loss(loss, enc_outputs, targets, indices, num_boxes) l_dict = {k + "_enc": v for k, v in l_dict.items()} losses.update(l_dict) return losses +def build_label_maps(logits, input_ids): + """ + Computes a mapping between the tokens associated with the prompt labels in the logit space 
with shape (batch_size, num_labels, hidden_size) + where `num_labels` is defined by the number of classes in the input prompt. + + For instance if the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102] + this function will then return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") + indicating their position in the logit space. + + This is used in `loss_labels` and in the `GroundingDinoHungarianMatcher`.) + """ + hidden_size = logits.shape[-1] + # Add [PAD] token to the list of special tokens + delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) + + delimiter_token_masks = torch.isin(input_ids, delimiter_tokens) + label_maps = () + for delimiter_token_mask in delimiter_token_masks: + label_map_within_batch = [] + delimiter_indices = torch.where(delimiter_token_mask)[0] + for i in range(len(delimiter_indices) - 1): + start = delimiter_indices[i] + end = delimiter_indices[i + 1] + if end - start > 1: + label_map = torch.zeros(hidden_size, device=input_ids.device) + label_map[start + 1 : end] = 1 + label_map_within_batch.append(label_map) + + label_maps += (torch.stack(label_map_within_batch),) + + return label_maps + + +def build_text_mask(logits, attention_mask): + """ + Create text_mask based on the matching indices + """ + seq_len = attention_mask.shape[1] + text_mask = torch.zeros_like(logits, device=logits.device, dtype=attention_mask.dtype) + text_mask[:, :, :seq_len] = attention_mask[:, None, :] + + return text_mask.bool() + + @add_start_docstrings( """ Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, @@ -3006,11 +2989,14 @@ def __init__(self, config: GroundingDinoConfig): # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): + def _set_aux_loss(self, outputs_class, outputs_coord, label_maps, text_mask): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. 
- return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + return [ + {"logits": a, "pred_boxes": b, "label_maps": label_maps, "text_mask": text_mask} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) + ] @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) @@ -3132,36 +3118,53 @@ def forward( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] + losses = ["labels", "boxes"] criterion = GroundingDinoLoss( matcher=matcher, - num_classes=self.config.num_labels, + class_reduction=self.config.class_loss_reduction, focal_alpha=self.config.focal_alpha, losses=losses, ) criterion.to(self.device) + label_maps = build_label_maps(logits, input_ids) + text_mask = build_text_mask(logits, attention_mask) # Third: compute the losses, based on outputs and labels outputs_loss = {} outputs_loss["logits"] = logits outputs_loss["pred_boxes"] = pred_boxes - outputs_loss["input_ids"] = input_ids - outputs_loss["attention_mask"] = attention_mask + outputs_loss["label_maps"] = label_maps + outputs_loss["text_mask"] = text_mask + if self.config.auxiliary_loss: - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord, label_maps, text_mask) outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: - enc_outputs_coord = outputs[-1].sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} + outputs_loss["enc_outputs"] = { + "logits": outputs[-2], + "pred_boxes": outputs[-1], + "label_maps": label_maps, + "text_mask": text_mask, + } loss_dict = criterion(outputs_loss, labels) # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient + weight_dict = { + "loss_ce": self.config.class_loss_coefficient, + "loss_bbox": self.config.bbox_loss_coefficient, + "loss_giou": self.config.giou_loss_coefficient, + } + + if self.config.two_stage: + enc_weight_dict = {k + "_enc": v for k, v in weight_dict.items()} + weight_dict.update(enc_weight_dict) + if self.config.auxiliary_loss: aux_weight_dict = {} for i in range(self.config.decoder_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) if not return_dict: @@ -3190,8 +3193,11 @@ def forward( intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, + enc_topk_proposals=outputs.enc_topk_proposals, enc_outputs_class=outputs.enc_outputs_class, enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + encoder_logits=outputs.encoder_logits, + encoder_pred_boxes=outputs.encoder_pred_boxes, ) return dict_outputs diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 7d47010c8ea0db..db0285bf38e8bc 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ 
b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -20,6 +20,8 @@ import re import unittest +from datasets import load_dataset + from transformers import ( GroundingDinoConfig, SwinConfig, @@ -44,7 +46,7 @@ if is_torch_available(): import torch - from transformers import GroundingDinoForObjectDetection, GroundingDinoModel + from transformers import GroundingDinoConfig, GroundingDinoForObjectDetection, GroundingDinoModel from transformers.pytorch_utils import id_tensor_storage @@ -616,9 +618,6 @@ def prepare_text(): return text -def prepare_for_loss(): ... - - @require_timm @require_vision @slow @@ -746,16 +745,65 @@ def test_cross_attention_mask(self): self.assertTrue(torch.allclose(outputs2.logits, outputs_batched.logits[1:], atol=1.8e-3)) def test_grounding_dino_loss(self): - model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(torch_device) + ds = load_dataset("EduardoPacheco/aquarium-sample", split="train") + image_processor = self.default_processor.image_processor + tokenizer = self.default_processor.tokenizer + id2label = {0: "fish", 1: "jellyfish", 2: "penguins", 3: "sharks", 4: "puffins", 5: "stingrays", 6: "starfish"} + prompt = ". ".join(id2label.values()) + "." + + text_inputs = tokenizer([prompt, prompt], return_tensors="pt") + image_inputs = image_processor(images=ds["image"], annotations=ds["annotations"], return_tensors="pt") + + # Passing class_reduction="sum" and auxiliary_loss=True to compare with the expected loss + model = GroundingDinoForObjectDetection.from_pretrained( + "IDEA-Research/grounding-dino-tiny", auxiliary_loss=True, class_loss_reduction="sum" + ) + # Interested in the loss only + model.eval() + with torch.no_grad(): + outputs = model(**text_inputs, **image_inputs) + + expected_loss_dict = { + "loss_ce": torch.tensor(1.1151), + "loss_bbox": torch.tensor(0.2031), + "loss_giou": torch.tensor(0.5819), + "loss_xy": torch.tensor(0.0927), + "loss_hw": torch.tensor(0.1104), + "loss_ce_0": torch.tensor(1.1942), + "loss_bbox_0": torch.tensor(0.1978), + "loss_giou_0": torch.tensor(0.5524), + "loss_xy_0": torch.tensor(0.0887), + "loss_hw_0": torch.tensor(0.1091), + "loss_ce_1": torch.tensor(1.1623), + "loss_bbox_1": torch.tensor(0.1909), + "loss_giou_1": torch.tensor(0.5892), + "loss_xy_1": torch.tensor(0.0926), + "loss_hw_1": torch.tensor(0.0982), + "loss_ce_2": torch.tensor(1.1643), + "loss_bbox_2": torch.tensor(0.1891), + "loss_giou_2": torch.tensor(0.5626), + "loss_xy_2": torch.tensor(0.0896), + "loss_hw_2": torch.tensor(0.0996), + "loss_ce_3": torch.tensor(1.1945), + "loss_bbox_3": torch.tensor(0.1943), + "loss_giou_3": torch.tensor(0.5592), + "loss_xy_3": torch.tensor(0.0895), + "loss_hw_3": torch.tensor(0.1048), + "loss_ce_4": torch.tensor(1.0946), + "loss_bbox_4": torch.tensor(0.2037), + "loss_giou_4": torch.tensor(0.5813), + "loss_xy_4": torch.tensor(0.0918), + "loss_hw_4": torch.tensor(0.1119), + "loss_ce_enc": torch.tensor(16226.3145), + "loss_bbox_enc": torch.tensor(0.3063), + "loss_giou_enc": torch.tensor(0.7380), + "loss_xy_enc": torch.tensor(0.1324), + "loss_hw_enc": torch.tensor(0.1739), + } - processor = self.default_processor - image, text, labels = prepare_for_loss() - encoding = processor(images=image, text=text, return_tensors="pt").to(torch_device) + expected_loss = torch.tensor(32482.2344) - with torch.no_grad(): - outputs = model(labels=labels, **encoding) + for key in expected_loss_dict: + self.assertTrue(torch.allclose(outputs.loss_dict[key], expected_loss_dict[key], atol=1e-3)) - # 
test loss - loss = outputs.loss - expected_loss = ... - self.assertEqual(loss.item(), expected_loss, msg="Loss is not matching expected value") + self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-3)) From 261305d41a58676c320d20b9133cf19c1413ea78 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Sun, 14 Jul 2024 01:21:30 +0200 Subject: [PATCH 05/61] fixed: failing tests --- .../grounding_dino/modeling_grounding_dino.py | 10 +++- .../test_modeling_grounding_dino.py | 46 +++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 466aaca16148a1..3f275a1715cdd7 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2465,7 +2465,15 @@ def forward( if not return_dict: enc_outputs = tuple( - value for value in [topk_proposals, enc_outputs_class, enc_outputs_coord_logits] if value is not None + value + for value in [ + topk_proposals, + enc_outputs_class, + enc_outputs_coord_logits, + encoder_logits, + encoder_pred_boxes, + ] + if value is not None ) tuple_outputs = ( (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index db0285bf38e8bc..9a14c247f33751 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -39,7 +39,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -56,6 +56,41 @@ from transformers import AutoProcessor +def generate_fake_bounding_boxes(n_boxes): + """Generate bounding boxes in the format (cx, cy, w, h)""" + # Validate the input + if not isinstance(n_boxes, int): + raise ValueError("n_boxes must be an integer") + if n_boxes <= 0: + raise ValueError("n_boxes must be a positive integer") + + # Generate random bounding boxes in the format (cx, cy, w, h) + bounding_boxes = torch.rand((n_boxes, 4)) + + for i in range(n_boxes): + cx, cy, w, h = bounding_boxes[i] + + # Ensure width and height do not exceed bounds + if w > 1.0: + w = 1.0 + if h > 1.0: + h = 1.0 + + # Ensure the bounding box stays within the normalized space + if cx - w / 2 < 0: + cx = w / 2 + if cx + w / 2 > 1: + cx = 1 - w / 2 + if cy - h / 2 < 0: + cy = h / 2 + if cy + h / 2 > 1: + cy = 1 - h / 2 + + bounding_boxes[i] = torch.tensor([cx, cy, w, h]) + + return bounding_boxes + + class GroundingDinoModelTester: def __init__( self, @@ -74,7 +109,7 @@ def __init__( num_channels=3, image_size=98, n_targets=8, - num_labels=3, + num_labels=2, num_feature_levels=4, encoder_n_points=2, decoder_n_points=6, @@ -117,7 +152,8 @@ def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) - input_ids = ids_tensor([self.batch_size, self.max_text_len], self.num_labels) + input_ids = torch.tensor([101, 3869, 1012, 11420, 1012, 1012, 102]) + input_ids = 
input_ids.unsqueeze(0).expand(self.batch_size, -1) labels = None if self.use_labels: @@ -128,7 +164,7 @@ def prepare_config_and_inputs(self): target["class_labels"] = torch.randint( high=self.num_labels, size=(self.n_targets,), device=torch_device ) - target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["boxes"] = generate_fake_bounding_boxes(self.n_targets).to(torch_device) target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) labels.append(target) @@ -319,7 +355,7 @@ def test_attention_outputs(self): ) out_len = len(outputs) - correct_outlen = 10 + correct_outlen = 13 # loss is at first position if "labels" in inputs_dict: From 51c201ac45fa0ebd91791f558987f5d6bc4f0ca0 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:19:58 +0000 Subject: [PATCH 06/61] fix typo --- docs/source/en/model_doc/grounding-dino.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index d258f492abf8b5..609572a2393f01 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -45,19 +45,19 @@ import requests import torch from PIL import Image -from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection model_id = "IDEA-Research/grounding-dino-tiny" processor = AutoProcessor.from_pretrained(model_id) -model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) +model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id) image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(image_url, stream=True).raw) # Check for cats and remote controls text = "a cat. a remote control." -inputs = processor(images=image, text=text, return_tensors="pt").to(device) +inputs = processor(images=image, text=text, return_tensors="pt") with torch.no_grad(): outputs = model(**inputs) From 9dd38e20bcebfa3f6cdf812f79f4a50fe0104363 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:20:07 +0000 Subject: [PATCH 07/61] uniform kwargs --- .../processing_grounding_dino.py | 74 +++++++++++-------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 44b99811d931ce..982b82b37ffe5f 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,12 +16,20 @@ Processor class for Grounding DINO. 
""" +import sys from typing import List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType, is_torch_available @@ -56,6 +64,26 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "truncation": None, + "max_length": None, + "stride": 0, + "pad_to_multiple_of": None, + "return_attention_mask": None, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": True, + "return_length": False, + "verbose": True, + } + } + + class GroundingDinoProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a @@ -83,21 +111,8 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = True, - return_length: bool = False, - verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -108,30 +123,29 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify either images or text.") + output_kwargs = self._merge_kwargs( + GroundingDinoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + # Get only text if images is not None: - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + encoding_image_processor = self.image_processor( + images, return_tensors=return_tensors, **output_kwargs["images_kwargs"] + ) else: encoding_image_processor = BatchFeature() if text is not None: text_encoding = self.tokenizer( text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, 
return_tensors=return_tensors, - **kwargs, + **output_kwargs["text_kwargs"], ) else: text_encoding = BatchEncoding() From ebc386282dccc725a1a7be9ae1edab25eda1a977 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:24:45 +0000 Subject: [PATCH 08/61] make style --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 982b82b37ffe5f..1020820efb8050 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -30,7 +30,7 @@ else: from typing_extensions import Unpack -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available From c6dc44528b39f8f58f072d2db912844ec501b9a3 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 13:05:22 +0000 Subject: [PATCH 09/61] add comments --- .../processing_grounding_dino.py | 17 +++++----- src/transformers/processing_utils.py | 28 ++++++++++++++++ .../test_processor_grounding_dino.py | 33 ++++++++++++++++++- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 1020820efb8050..fe80864f8e0d09 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -17,7 +17,7 @@ """ import sys -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format @@ -69,18 +69,18 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "add_special_tokens": True, "padding": False, - "truncation": None, - "max_length": None, "stride": 0, - "pad_to_multiple_of": None, - "return_attention_mask": None, "return_overflowing_tokens": False, "return_special_tokens_mask": False, "return_offsets_mapping": False, - "return_token_type_ids": True, + "return_token_type_ids": False, "return_length": False, "verbose": True, - } + }, + "images_kwargs": { + "do_convert_annotations": True, + "do_resize": True, + }, } @@ -111,7 +111,8 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, + audio=None, + videos=None, **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 7062a7699a79f7..2e724eb2264a0a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -20,6 +20,7 @@ import inspect import json import os +import pathlib import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union @@ -40,6 +41,7 @@ ) from .utils import ( PROCESSOR_NAME, + ExplicitEnum, PushToHubMixin, TensorType, add_model_info_to_auto_map, @@ -56,6 +58,14 @@ logger = logging.get_logger(__name__) +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + 
+class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + # Dynamically import the Transformers module to grab the attribute classes of the processor form their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -128,6 +138,12 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. + return_segmentation_masks (`bool`, *optional*): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. do_resize (`bool`, *optional*): Whether to resize the image. size (`Dict[str, int]`, *optional*): @@ -144,6 +160,8 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. + do_convert_annotations (`bool`, *optional*): + Whether to convert the annotations to the format expected by the model. image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. image_std (`float` or `List[float]`, *optional*): @@ -152,12 +170,19 @@ class methods and docstrings. Whether to pad the image to the `(max_height, max_width)` of the images in the batch. do_center_crop (`bool`, *optional*): Whether to center crop the image. + format (`str` or `AnnotationFormat`, *optional*): + Format of the annotations. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. """ + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] do_resize: Optional[bool] size: Optional[Dict[str, int]] size_divisor: Optional[int] @@ -166,12 +191,15 @@ class methods and docstrings. 
do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] + do_convert_annotations: Optional[bool] image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] + pad_size: Optional[Dict[str, int]] class VideosKwargs(TypedDict, total=False): diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index a788d09ca7eed1..b7a259f0c31526 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -26,6 +26,8 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -40,7 +42,9 @@ @require_torch @require_vision -class GroundingDinoProcessorTest(unittest.TestCase): +class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GroundingDinoProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -251,3 +255,30 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 11) From 16ddefd33d42be09015f6e06916d1f170ed050ac Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 16 Jul 2024 01:00:01 +0000 Subject: [PATCH 10/61] remove return_tensors --- .../models/grounding_dino/processing_grounding_dino.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fe80864f8e0d09..4aa081fd70f224 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -122,7 +122,7 @@ def __call__( Please refer to the docstring of the above two methods for more information. 
""" if images is None and text is None: - raise ValueError("You have to specify either images or text.") + raise ValueError("You must specify either text or images.") output_kwargs = self._merge_kwargs( GroundingDinoProcessorKwargs, @@ -130,14 +130,10 @@ def __call__( **kwargs, ) - # BC for explicit return_tensors - if "return_tensors" in output_kwargs["common_kwargs"]: - return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) - # Get only text if images is not None: encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, **output_kwargs["images_kwargs"] + images, **output_kwargs["common_kwargs"], **output_kwargs["images_kwargs"] ) else: encoding_image_processor = BatchFeature() @@ -145,7 +141,7 @@ def __call__( if text is not None: text_encoding = self.tokenizer( text=text, - return_tensors=return_tensors, + **output_kwargs["common_kwargs"], **output_kwargs["text_kwargs"], ) else: From f9560659ae43f04db558ef0ae1f2854176c43b76 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:06:07 +0200 Subject: [PATCH 11/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 3f275a1715cdd7..b176f2a2e23f46 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -259,7 +259,7 @@ class GroundingDinoModelOutput(ModelOutput): multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the bi-attention heads. enc_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): - Top `config.num_queries` scoring bounding boxes indicies picked as region proposals in the first stage. + Top `config.num_queries` scoring bounding boxes indices picked as region proposals in the first stage. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. 
foreground and From d06206ea098cb00ba8290d3b830cd4b7b118a113 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:06:16 +0200 Subject: [PATCH 12/61] Update tests/models/grounding_dino/test_modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- tests/models/grounding_dino/test_modeling_grounding_dino.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 9a14c247f33751..160c817cdf34ec 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -71,10 +71,8 @@ def generate_fake_bounding_boxes(n_boxes): cx, cy, w, h = bounding_boxes[i] # Ensure width and height do not exceed bounds - if w > 1.0: - w = 1.0 - if h > 1.0: - h = 1.0 + w = min(w, 1.0) + h = min(h, 1.0) # Ensure the bounding box stays within the normalized space if cx - w / 2 < 0: From 8f1ffc6a257a46402bc5ca5e99b18600be553ca7 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:06:52 +0200 Subject: [PATCH 13/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index b176f2a2e23f46..9bfc91ac71d2d8 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -347,7 +347,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput): init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. enc_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): - Top `config.num_queries` scoring bounding boxes indicies picked as region proposals in the first stage. + Top `config.num_queries` scoring bounding boxes indices picked as region proposals in the first stage. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as region proposals in the first stage. Output of bounding box binary classification (i.e. 
foreground and From 61e7658bcd4250da2e6da8db57961b52161946cd Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:09:13 +0200 Subject: [PATCH 14/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9bfc91ac71d2d8..695add466b4e10 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2619,9 +2619,9 @@ def sigmoid_focal_loss( The number of query boxes per image. reduction (`str`, *optional*, defaults to `'mean'`): Specifies the redction to apply to the loss. Can be either `'mean'`, or `'sum'`. - alpha (`float`, *optional*, defaults to `0.25`): + alpha (`float`, *optional*, defaults to 0.25): Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): + gamma (`int`, *optional*, defaults to 2): Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. Returns: From 5d356fc8f81f651577d8b84ac4c5f028e571d481 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:10:14 +0200 Subject: [PATCH 15/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 695add466b4e10..62f561c686306b 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2915,7 +2915,7 @@ def forward(self, outputs, targets): def build_label_maps(logits, input_ids): """ - Computes a mapping between the tokens associated with the prompt labels in the logit space with shape (batch_size, num_labels, hidden_size) + Computes a mapping between the tokens associated with the prompt labels in the logit space with shape `(batch_size, num_labels, hidden_size)` where `num_labels` is defined by the number of classes in the input prompt. For instance if the prompt "fish. shark." 
we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102] From 0860f3b3ee95b3d8c5aca487d330dfbf33057657 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:12:32 +0200 Subject: [PATCH 16/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 62f561c686306b..7e10d96c90e9c6 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2918,7 +2918,7 @@ def build_label_maps(logits, input_ids): Computes a mapping between the tokens associated with the prompt labels in the logit space with shape `(batch_size, num_labels, hidden_size)` where `num_labels` is defined by the number of classes in the input prompt. - For instance if the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102] + For instance, given the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102]. this function will then return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") indicating their position in the logit space. From 07048ad44919498806749e33a2e0ccfa02cb296b Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:12:58 +0200 Subject: [PATCH 17/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 7e10d96c90e9c6..d7d4589b2823c2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2919,7 +2919,7 @@ def build_label_maps(logits, input_ids): where `num_labels` is defined by the number of classes in the input prompt. For instance, given the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102]. - this function will then return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") + This function will return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") indicating their position in the logit space. This is used in `loss_labels` and in the `GroundingDinoHungarianMatcher`.) 
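A minimal, self-contained sketch of the label-map construction that the `build_label_maps` docstring above describes. It is illustrative only and not the exact implementation in these patches: the `SPECIAL_TOKENS` ids are assumed to be those of `[CLS]`, `[SEP]`, "." and "?" for the BERT tokenizer, and the toy `hidden_size` equals the prompt length, whereas the real function works on the full logit dimension and returns one map per batch element.

```python
import torch

# Assumed delimiter ids: [CLS], [SEP], ".", "?" ([PAD] = 0 is appended below)
SPECIAL_TOKENS = [101, 102, 1012, 1029]


def build_label_map(input_ids: torch.Tensor, hidden_size: int) -> torch.Tensor:
    """Return a (num_labels, hidden_size) multi-hot map: row i is 1 at the token
    positions spelling out class i in the prompt, 0 elsewhere."""
    delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device)
    delimiter_indices = torch.isin(input_ids, delimiter_tokens).nonzero(as_tuple=True)[0]

    rows = []
    for start, end in zip(delimiter_indices[:-1], delimiter_indices[1:]):
        start, end = int(start), int(end)
        if end - start > 1:  # at least one label token sits between two delimiters
            row = torch.zeros(hidden_size, device=input_ids.device)
            row[start + 1 : end] = 1.0
            rows.append(row)
    return torch.stack(rows)


# "fish. shark." -> [CLS] fish . shark . [SEP]
input_ids = torch.tensor([101, 3869, 1012, 11420, 1012, 102])
print(build_label_map(input_ids, hidden_size=input_ids.shape[0]))
# tensor([[0., 1., 0., 0., 0., 0.],
#         [0., 0., 0., 1., 0., 0.]])
```

Each row of such a map picks out the logit positions belonging to one class name, which is what lets `loss_labels` and the matcher stay agnostic of how the prompt is worded.
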
From 1f81e13c1c66716d5b6c96cbb1638fa92049e281 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:13:31 +0200 Subject: [PATCH 18/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d7d4589b2823c2..74a1366d870e77 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2639,7 +2639,10 @@ def sigmoid_focal_loss( if reduction == "mean": return loss.sum() / num_queries / num_boxes - return loss.sum() / num_boxes + elif reduction == "sum": + return loss.sum() / num_boxes + else: + raise ValueError(f"{reduction=} is not a valid reduction method") # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino From a20eea8f893a9e43d063f7c3265c0aa3bb725a14 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:13:39 +0200 Subject: [PATCH 19/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 74a1366d870e77..48ae7bee089f3d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2439,7 +2439,7 @@ def forward( object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) ).detach() - # Set intermediate topk proposals (coordsa nd class) for loss computation + # Set intermediate topk proposals (coords and class) for loss computation encoder_pred_boxes = topk_coords_logits.sigmoid() encoder_logits = self.encoder_output_class_embed(target, text_features, text_token_mask) else: From cbe6ea8722c1906230a17ea83f698fb56fc4aa32 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:14:14 +0200 Subject: [PATCH 20/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 48ae7bee089f3d..e4fab1e84da847 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2925,7 +2925,6 @@ def build_label_maps(logits, input_ids): This function will return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") indicating their position in the logit space. - This is used in `loss_labels` and in the `GroundingDinoHungarianMatcher`.) 
""" hidden_size = logits.shape[-1] # Add [PAD] token to the list of special tokens From 1f9a0eeb22e07fd6f0a8a904146e7186e4938c95 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 01:39:42 +0000 Subject: [PATCH 21/61] remove common_kwargs from processor since it propagates --- .../models/grounding_dino/processing_grounding_dino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 4aa081fd70f224..c005f2d031087c 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -133,7 +133,7 @@ def __call__( # Get only text if images is not None: encoding_image_processor = self.image_processor( - images, **output_kwargs["common_kwargs"], **output_kwargs["images_kwargs"] + images, **output_kwargs["images_kwargs"] ) else: encoding_image_processor = BatchFeature() @@ -141,7 +141,6 @@ def __call__( if text is not None: text_encoding = self.tokenizer( text=text, - **output_kwargs["common_kwargs"], **output_kwargs["text_kwargs"], ) else: From 0696dcf60720e8aa9d94c519c3f3da14d5820bb2 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 01:51:09 +0000 Subject: [PATCH 22/61] make style --- .../models/grounding_dino/processing_grounding_dino.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index c005f2d031087c..9eccc7320ccc6b 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -132,9 +132,7 @@ def __call__( # Get only text if images is not None: - encoding_image_processor = self.image_processor( - images, **output_kwargs["images_kwargs"] - ) + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) else: encoding_image_processor = BatchFeature() From 850b9d5f6ea7224d6afd17311a20880d86f144d9 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 02:06:37 +0000 Subject: [PATCH 23/61] return_token_type_ids to True --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 9eccc7320ccc6b..74cda0c5953915 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -73,7 +73,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_overflowing_tokens": False, "return_special_tokens_mask": False, "return_offsets_mapping": False, - "return_token_type_ids": False, + "return_token_type_ids": True, "return_length": False, "verbose": True, }, From 71089006f33f7288b9725755478d5b537f45452e Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Tue, 23 Jul 2024 12:40:43 +0200 Subject: [PATCH 24/61] Addressed comments --- .../grounding_dino/modeling_grounding_dino.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 
e4fab1e84da847..5528b85042a521 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2591,10 +2591,8 @@ def generalized_box_iou(boxes1, boxes2): return iou - (area - union) / area -# Changed the way we reduce the loss since pass only valid logits to avoid inf/nan problems -# also added num_queries and reduction args as the base implementation uses 'sum' reduction -# but the sigmoid_focal_loss in the original implementation uses 'mean' reduction and the 'sum' -# reduction leads to a loss_ce that is considerably higher than the other losses. +# Similar to `DeformableDetr` but we pass `num_queries` (because inputs are only the valid logits) +# and we also pass `reduction` for testing purposes. def sigmoid_focal_loss( inputs: torch.Tensor, targets: torch.Tensor, @@ -2774,7 +2772,6 @@ def _get_target_classes_one_hot(self, outputs, targets, indices): return target_classes_onehot - # Added new target_classes_onehot and step to get valid logits and new sigmoid_focal_loss signature def loss_labels(self, outputs, targets, indices, num_boxes): """ Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor @@ -2810,7 +2807,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): return losses - # Added loss_xy and loss_hw to calculate the x,y and h,w loss + # Same as in `DeformableDetrLoss` with the addition of loss_xy and loss_hw to calculate the x,y and h,w loss def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. @@ -2841,6 +2838,23 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): return losses + @torch.no_grad() + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx def _get_source_permutation_idx(self, indices): # permute predictions following indices @@ -2855,11 +2869,11 @@ def _get_target_permutation_idx(self, indices): target_idx = torch.cat([target for (_, target) in indices]) return batch_idx, target_idx - # Removed cardinality loss as it is not used. 
def get_loss(self, loss, outputs, targets, indices, num_boxes): loss_map = { "labels": self.loss_labels, "boxes": self.loss_boxes, + "cardinality": self.loss_cardinality, } if loss not in loss_map: raise ValueError(f"Loss {loss} not supported") From c96c02b6ecdb0047c1ec19772244042ea3086831 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 00:46:39 +0000 Subject: [PATCH 25/61] revert the default imagekwargs since does not accept any value in the image processro --- .../models/grounding_dino/processing_grounding_dino.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 74cda0c5953915..9928eacbeb5ce0 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -78,8 +78,6 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, }, "images_kwargs": { - "do_convert_annotations": True, - "do_resize": True, }, } From 8cff6b609438691cb4cbc1195bd6f058d8794e4b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:08:01 +0000 Subject: [PATCH 26/61] revert processing_utils.py --- src/transformers/processing_utils.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 2e724eb2264a0a..7062a7699a79f7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -20,7 +20,6 @@ import inspect import json import os -import pathlib import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union @@ -41,7 +40,6 @@ ) from .utils import ( PROCESSOR_NAME, - ExplicitEnum, PushToHubMixin, TensorType, add_model_info_to_auto_map, @@ -58,14 +56,6 @@ logger = logging.get_logger(__name__) -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - - # Dynamically import the Transformers module to grab the attribute classes of the processor form their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -138,12 +128,6 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. - return_segmentation_masks (`bool`, *optional*): - Whether to return segmentation masks. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. do_resize (`bool`, *optional*): Whether to resize the image. size (`Dict[str, int]`, *optional*): @@ -160,8 +144,6 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. - do_convert_annotations (`bool`, *optional*): - Whether to convert the annotations to the format expected by the model. image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. image_std (`float` or `List[float]`, *optional*): @@ -170,19 +152,12 @@ class methods and docstrings. Whether to pad the image to the `(max_height, max_width)` of the images in the batch. do_center_crop (`bool`, *optional*): Whether to center crop the image. 
- format (`str` or `AnnotationFormat`, *optional*): - Format of the annotations. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. - pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. """ - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] do_resize: Optional[bool] size: Optional[Dict[str, int]] size_divisor: Optional[int] @@ -191,15 +166,12 @@ class methods and docstrings. do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - do_convert_annotations: Optional[bool] image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - pad_size: Optional[Dict[str, int]] class VideosKwargs(TypedDict, total=False): From bb1f18bb3bb9230dd3acfa5349328daf749c5fbe Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:25:14 +0000 Subject: [PATCH 27/61] make style --- .../models/grounding_dino/processing_grounding_dino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 9928eacbeb5ce0..a09d21502b3c07 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -77,8 +77,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": { - }, + "images_kwargs": {}, } From a476c6ee88318ce40d73ea31e2dc2d4faa8ae410 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:47:47 +0000 Subject: [PATCH 28/61] add molbap's commit --- .../processing_grounding_dino.py | 23 +- src/transformers/processing_utils.py | 71 +++---- tests/test_processing_common.py | 200 ++++++++++++++++-- 3 files changed, 225 insertions(+), 69 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index a09d21502b3c07..167b5598bfa7b9 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,14 @@ Processor class for Grounding DINO. 
""" +import pathlib import sys -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -31,12 +32,19 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import ExplicitEnum, TensorType, is_torch_available if is_torch_available(): import torch +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. @@ -64,7 +72,16 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 7062a7699a79f7..d9f1e6f5efabde 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -150,6 +150,8 @@ class methods and docstrings. Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): Whether to center crop the image. data_format (`ChannelDimension` or `str`, *optional*): @@ -169,6 +171,7 @@ class methods and docstrings. image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] + pad_size: Optional[Dict[str, int]] do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -320,7 +323,6 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None - valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -649,15 +651,14 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): processor_dict = processor_dict.copy() return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs - # If we don't pop, some specific kwargs will raise a warning + # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. + # We have to pop up some unused (but specific) arguments to make it work. 
if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] - unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) # Update processor with kwargs if needed @@ -665,7 +666,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) - kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -743,38 +743,34 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. - # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # gather common kwargs and remove them from individual kwargs if present + common_kwargs = { + key: value + for key, value in kwargs.items() + if key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ + } + + # ensure common kwargs are propagated to all relevant modalities + for key, value in common_kwargs.items(): + for modality in output_kwargs: + if modality != "common_kwargs": + output_kwargs[modality][key] = value + + # remove common kwargs from the kwargs to process the rest + kwargs = {k: v for k, v in kwargs.items() if k not in common_kwargs} + # update modality kwargs with passed kwargs - non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - # check if we received a structured kwarg dict or not to handle it correctly - if modality in kwargs: - kwarg_value = kwargs[modality].pop(modality_key, "__empty__") - # check if this key was passed as a flat kwarg. - if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: - raise ValueError( - f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." 
- ) + if modality in kwargs and modality_key in kwargs[modality]: + output_kwargs[modality][modality_key] = kwargs[modality][modality_key] elif modality_key in kwargs: - kwarg_value = kwargs.pop(modality_key, "__empty__") - else: - kwarg_value = "__empty__" - if kwarg_value != "__empty__": - output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common after flattening - if set(kwargs) & set(default_kwargs): - # here kwargs is dictionary-based since it shares keys with default set - [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] - else: - # here it's a flat dict - output_kwargs["common_kwargs"].update(kwargs) - - # all modality-specific kwargs are updated with common kwargs - for modality in output_kwargs: - output_kwargs[modality].update(output_kwargs["common_kwargs"]) + output_kwargs[modality][modality_key] = kwargs[modality_key] return output_kwargs @classmethod @@ -890,19 +886,6 @@ def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) - @staticmethod - def validate_init_kwargs(processor_config, valid_kwargs): - kwargs_from_config = processor_config.keys() - unused_kwargs = {} - unused_keys = set(kwargs_from_config) - set(valid_kwargs) - if unused_keys: - unused_key_str = ", ".join(unused_keys) - logger.warning( - f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. " - ) - unused_kwargs = {k: processor_config[k] for k in unused_keys} - return unused_kwargs - def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 074aa2f1d62545..e6128cde9bb503 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,6 +16,7 @@ import inspect import json +import random import tempfile @@ -38,15 +39,31 @@ from transformers.utils import is_vision_available +global_rng = random.Random() + if is_vision_available(): from PIL import Image from transformers import CLIPImageProcessor +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + @require_torch @require_vision -@require_torch class ProcessorTesterMixin: processor_class = None @@ -60,7 +77,10 @@ def get_component(self, attribute, **kwargs): component_class_name = component_class_name[0] component_class = processor_class_from_name(component_class_name) - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + if hasattr(self, "tmpdirname"): + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + elif hasattr(self, "model_id"): + component = component_class.from_pretrained(self.model_id, **kwargs) # noqa return component @@ -126,13 +146,13 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + if not 
tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(len(inputs["input_ids"][0]), 117) @@ -141,15 +161,15 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @@ -160,13 +180,15 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @@ -174,16 +196,17 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224], size=[224, 224]) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -193,7 +216,8 @@ def test_unstructured_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, 
image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -204,6 +228,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -218,7 +243,8 @@ def test_unstructured_kwargs_batched(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -229,10 +255,10 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 6) @@ -244,7 +270,8 @@ def test_doubly_passed_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -265,7 +292,8 @@ def test_structured_kwargs_nested(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -275,7 +303,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -294,7 +322,8 @@ def test_structured_kwargs_nested_from_dict(self): image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -303,7 +332,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -312,6 +341,133 @@ def test_structured_kwargs_nested_from_dict(self): self.assertEqual(len(inputs["input_ids"][0]), 76) + # text + audio kwargs testing + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in 
{self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117, padding="max_length") + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 117) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 117) + + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 112) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 112) + + @require_torch + def test_unstructured_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor( + text=input_str, + audio=raw_speech, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + + @require_torch + def test_doubly_passed_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + 
input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + audio=raw_speech, + audio_kwargs={"padding": "max_length"}, + padding="max_length", + ) + + @require_torch + @require_vision + def test_structured_kwargs_audio_nested(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "audio_kwargs": {"padding": "max_length", "max_length": 66}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + class MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From 81045219a22ed34a73cac3fe924908e57a9ebe53 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 02:40:01 +0000 Subject: [PATCH 29/61] fix typo --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index dcdccc50cc116d..c33718bde54410 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1580,7 +1580,7 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDinoTokenizer.__call__`] for details. + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, From 5d6a088566b743d64defbd6bc55cdf00cbbe985e Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 12:28:55 +0000 Subject: [PATCH 30/61] fix common processor --- src/transformers/processing_utils.py | 35 +++++++++++-------- .../test_processor_grounding_dino.py | 10 +++++- tests/test_processing_common.py | 14 ++++---- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 8e0ab968d8589a..83ad01714dbceb 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -736,12 +736,12 @@ def _merge_kwargs( The order of operations is as follows: 1) kwargs passed as before have highest priority to preserve BC. 
```python - high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"} + high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"} processor(..., **high_priority_kwargs) ``` 2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API. ```python - processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}}) + processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}}) ``` 3) kwargs passed during instantiation of a modality processor have fourth priority. ```python @@ -799,14 +799,20 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg output_kwargs.update(default_kwargs) # gather common kwargs and remove them from individual kwargs if present - common_kwargs = { - key: value - for key, value in kwargs.items() - if key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ - } + common_kwargs = {} + for key, value in kwargs.items(): + if key == "common_kwargs": + for common_key, common_value in value.items(): + common_kwargs[common_key] = common_value + elif key in ["text_kwargs", "images_kwargs", "audio_kwargs", "videos_kwargs"]: + pass + elif ( + key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ + ): + common_kwargs[key] = value # ensure common kwargs are propagated to all relevant modalities for key, value in common_kwargs.items(): @@ -820,10 +826,10 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # update modality kwargs with passed kwargs for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - if modality in kwargs and modality_key in kwargs[modality]: - output_kwargs[modality][modality_key] = kwargs[modality][modality_key] - elif modality_key in kwargs: + if modality_key in kwargs: output_kwargs[modality][modality_key] = kwargs[modality_key] + elif modality in kwargs and modality_key in kwargs[modality]: + output_kwargs[modality][modality_key] = kwargs[modality][modality_key] return output_kwargs @classmethod @@ -988,5 +994,4 @@ def apply_chat_template( ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) if ProcessorMixin.push_to_hub.__doc__ is not None: ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( - object="processor", object_class="AutoProcessor", object_files="processor files" - ) + object="processor", object_class="AutoProcessor", object_ \ No newline at end of file diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index b7a259f0c31526..448aa8f7fb6433 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -43,6 +43,7 @@ @require_torch @require_vision class 
GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "IDEA-Research/grounding-dino-base" processor_class = GroundingDinoProcessor def setUp(self): @@ -67,6 +68,13 @@ def setUp(self): with open(self.image_processor_file, "w", encoding="utf-8") as fp: json.dump(image_processor_map, fp) + image_processor = GroundingDinoImageProcessor() + tokenizer = BertTokenizer.from_pretrained(self.from_pretrained_id) + + processor = GroundingDinoProcessor(image_processor, tokenizer) + + processor.save_pretrained(self.tmpdirname) + self.batch_size = 7 self.num_queries = 5 self.embed_dim = 5 @@ -281,4 +289,4 @@ def test_unstructured_kwargs_batched(self): ) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 11) + self.assertEqual(len(inputs["input_ids"][0]), 6) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index e6128cde9bb503..b43d48e530b8ce 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -277,13 +277,13 @@ def test_doubly_passed_kwargs(self): input_str = ["lower newer"] image_input = self.prepare_image_inputs() - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - images=image_input, - images_kwargs={"crop_size": {"height": 222, "width": 222}}, - crop_size={"height": 214, "width": 214}, - ) + inputs = processor( + text=input_str, + images=image_input, + images_kwargs={"size": {"height": 222, "width": 222}}, + size={"height": 35, "width": 35}, + ) + self.assertEqual(inputs["pixel_values"][0].shape[2], 35) @require_torch @require_vision From d5b13d2beb4cec5f617bdf6accf40241983c43cb Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 12:29:47 +0000 Subject: [PATCH 31/61] remain --- src/transformers/processing_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 83ad01714dbceb..372e54e9e2776a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -994,4 +994,5 @@ def apply_chat_template( ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) if ProcessorMixin.push_to_hub.__doc__ is not None: ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( - object="processor", object_class="AutoProcessor", object_ \ No newline at end of file + object="processor", object_class="AutoProcessor", object_files="processor files" + ) From 889c4ed0987e6a5f16d7b56867afa168a67e68f9 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Mon, 29 Jul 2024 13:00:20 +0200 Subject: [PATCH 32/61] Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 5528b85042a521..2885231cf08c9e 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2440,7 +2440,7 @@ def forward( ).detach() # Set intermediate topk proposals (coords and class) for loss computation - encoder_pred_boxes = topk_coords_logits.sigmoid() + encoder_pred_boxes = reference_points encoder_logits = 
self.encoder_output_class_embed(target, text_features, text_token_mask) else: target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) From 73942bc8886f5450886d293f3ec28a558dcc98a9 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 29 Jul 2024 13:00:13 +0200 Subject: [PATCH 33/61] add: cardinality loss and make box loss as copy from --- .../grounding_dino/modeling_grounding_dino.py | 4 ++-- .../grounding_dino/test_modeling_grounding_dino.py | 14 -------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 2885231cf08c9e..27f0b81a5dfb6a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2807,7 +2807,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): return losses - # Same as in `DeformableDetrLoss` with the addition of loss_xy and loss_hw to calculate the x,y and h,w loss + # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes def loss_boxes(self, outputs, targets, indices, num_boxes): """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. @@ -3142,7 +3142,7 @@ def forward( class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost ) # Second: create the criterion - losses = ["labels", "boxes"] + losses = ["labels", "boxes", "cardinality"] criterion = GroundingDinoLoss( matcher=matcher, class_reduction=self.config.class_loss_reduction, diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 160c817cdf34ec..f864e098b458d2 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -801,38 +801,24 @@ def test_grounding_dino_loss(self): "loss_ce": torch.tensor(1.1151), "loss_bbox": torch.tensor(0.2031), "loss_giou": torch.tensor(0.5819), - "loss_xy": torch.tensor(0.0927), - "loss_hw": torch.tensor(0.1104), "loss_ce_0": torch.tensor(1.1942), "loss_bbox_0": torch.tensor(0.1978), "loss_giou_0": torch.tensor(0.5524), - "loss_xy_0": torch.tensor(0.0887), - "loss_hw_0": torch.tensor(0.1091), "loss_ce_1": torch.tensor(1.1623), "loss_bbox_1": torch.tensor(0.1909), "loss_giou_1": torch.tensor(0.5892), - "loss_xy_1": torch.tensor(0.0926), - "loss_hw_1": torch.tensor(0.0982), "loss_ce_2": torch.tensor(1.1643), "loss_bbox_2": torch.tensor(0.1891), "loss_giou_2": torch.tensor(0.5626), - "loss_xy_2": torch.tensor(0.0896), - "loss_hw_2": torch.tensor(0.0996), "loss_ce_3": torch.tensor(1.1945), "loss_bbox_3": torch.tensor(0.1943), "loss_giou_3": torch.tensor(0.5592), - "loss_xy_3": torch.tensor(0.0895), - "loss_hw_3": torch.tensor(0.1048), "loss_ce_4": torch.tensor(1.0946), "loss_bbox_4": torch.tensor(0.2037), "loss_giou_4": torch.tensor(0.5813), - "loss_xy_4": torch.tensor(0.0918), - "loss_hw_4": torch.tensor(0.1119), "loss_ce_enc": torch.tensor(16226.3145), "loss_bbox_enc": torch.tensor(0.3063), "loss_giou_enc": torch.tensor(0.7380), - "loss_xy_enc": torch.tensor(0.1324), - "loss_hw_enc": torch.tensor(0.1739), } expected_loss = torch.tensor(32482.2344) From 7932111e10550486892e30f16288818cd73db100 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 29 Jul 2024 13:03:19 +0200 Subject: [PATCH 34/61] change: default for reduction 
loss is sum --- .../models/grounding_dino/configuration_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 8d7316718c91bd..4afd7717d82985 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -188,7 +188,7 @@ def __init__( class_loss_coefficient=2.0, bbox_loss_coefficient=5.0, giou_loss_coefficient=2.0, - class_loss_reduction="mean", + class_loss_reduction="sum", focal_alpha=0.25, disable_custom_kernels=False, # other parameters From 277b3564a50e61aa63ed72156073834c8ca10078 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 29 Jul 2024 13:07:20 +0200 Subject: [PATCH 35/61] fix: vectorized generate fake box --- .../test_modeling_grounding_dino.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index f864e098b458d2..84fc842acb2780 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -67,24 +67,24 @@ def generate_fake_bounding_boxes(n_boxes): # Generate random bounding boxes in the format (cx, cy, w, h) bounding_boxes = torch.rand((n_boxes, 4)) - for i in range(n_boxes): - cx, cy, w, h = bounding_boxes[i] - - # Ensure width and height do not exceed bounds - w = min(w, 1.0) - h = min(h, 1.0) - - # Ensure the bounding box stays within the normalized space - if cx - w / 2 < 0: - cx = w / 2 - if cx + w / 2 > 1: - cx = 1 - w / 2 - if cy - h / 2 < 0: - cy = h / 2 - if cy + h / 2 > 1: - cy = 1 - h / 2 - - bounding_boxes[i] = torch.tensor([cx, cy, w, h]) + # Extract the components + cx = bounding_boxes[:, 0] + cy = bounding_boxes[:, 1] + w = bounding_boxes[:, 2] + h = bounding_boxes[:, 3] + + # Ensure width and height do not exceed bounds + w = torch.min(w, torch.tensor(1.0)) + h = torch.min(h, torch.tensor(1.0)) + + # Ensure the bounding box stays within the normalized space + cx = torch.where(cx - w / 2 < 0, w / 2, cx) + cx = torch.where(cx + w / 2 > 1, 1 - w / 2, cx) + cy = torch.where(cy - h / 2 < 0, h / 2, cy) + cy = torch.where(cy + h / 2 > 1, 1 - h / 2, cy) + + # Combine back into bounding boxes + bounding_boxes = torch.stack([cx, cy, w, h], dim=1) return bounding_boxes From 7f9df2991c0eacac0e66fb3cf169f896312aec25 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 29 Jul 2024 13:10:51 +0200 Subject: [PATCH 36/61] fix copies --- .../models/grounding_dino/configuration_grounding_dino.py | 2 +- .../models/grounding_dino/modeling_grounding_dino.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 4afd7717d82985..684f02d6457f70 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -103,7 +103,7 @@ class GroundingDinoConfig(PretrainedConfig): Relative weight of the L1 bounding box loss in the object detection loss. giou_loss_coefficient (`float`, *optional*, defaults to 2.0): Relative weight of the generalized IoU loss in the object detection loss. 
- class_loss_reduction (`str`, *optional*, defaults to `"mean"`): + class_loss_reduction (`str`, *optional*, defaults to `"sum"`): The reduction method for the classification loss. One of `"mean"` or `"sum"`. focal_alpha (`float`, *optional*, defaults to 0.25): Alpha parameter in the focal loss. diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 27f0b81a5dfb6a..c3e889fef6ec75 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2830,12 +2830,6 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) ) losses["loss_giou"] = loss_giou.sum() / num_boxes - - # calculate the x,y and h,w loss - with torch.no_grad(): - losses["loss_xy"] = loss_bbox[..., :2].sum() / num_boxes - losses["loss_hw"] = loss_bbox[..., 2:].sum() / num_boxes - return losses @torch.no_grad() From 1cf9139ef8dad3c48db6f47f6f5c8ed6e351d79d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 12:27:59 +0000 Subject: [PATCH 37/61] Revert "add molbap's commit" This reverts commit a476c6ee88318ce40d73ea31e2dc2d4faa8ae410. --- .../processing_grounding_dino.py | 23 +- src/transformers/processing_utils.py | 79 ++++--- tests/test_processing_common.py | 200 ++---------------- 3 files changed, 70 insertions(+), 232 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 167b5598bfa7b9..a09d21502b3c07 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,14 +16,13 @@ Processor class for Grounding DINO. """ -import pathlib import sys -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -32,19 +31,12 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import ExplicitEnum, TensorType, is_torch_available +from ...utils import TensorType, is_torch_available if is_torch_available(): import torch -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. 
@@ -72,16 +64,7 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids -class GroundingDinoImagesKwargs(ImagesKwargs, total=False): - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] - do_convert_annotations: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] - - class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 372e54e9e2776a..9abb4b29fcd7ff 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -151,8 +151,6 @@ class methods and docstrings. Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. - pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): Whether to center crop the image. data_format (`ChannelDimension` or `str`, *optional*): @@ -172,7 +170,6 @@ class methods and docstrings. image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] - pad_size: Optional[Dict[str, int]] do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -324,6 +321,7 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None + valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -702,14 +700,15 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) chat_template = kwargs.pop("chat_template", None) - # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. - # We have to pop up some unused (but specific) arguments to make it work. + # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs + # If we don't pop, some specific kwargs will raise a warning if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] + unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) if chat_template is not None: setattr(processor, "chat_template", chat_template) @@ -719,6 +718,7 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) + kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -796,40 +796,38 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. 
+ # pass defaults to output dictionary output_kwargs.update(default_kwargs) - # gather common kwargs and remove them from individual kwargs if present - common_kwargs = {} - for key, value in kwargs.items(): - if key == "common_kwargs": - for common_key, common_value in value.items(): - common_kwargs[common_key] = common_value - elif key in ["text_kwargs", "images_kwargs", "audio_kwargs", "videos_kwargs"]: - pass - elif ( - key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ - ): - common_kwargs[key] = value - - # ensure common kwargs are propagated to all relevant modalities - for key, value in common_kwargs.items(): - for modality in output_kwargs: - if modality != "common_kwargs": - output_kwargs[modality][key] = value - - # remove common kwargs from the kwargs to process the rest - kwargs = {k: v for k, v in kwargs.items() if k not in common_kwargs} - # update modality kwargs with passed kwargs + non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - if modality_key in kwargs: - output_kwargs[modality][modality_key] = kwargs[modality_key] - elif modality in kwargs and modality_key in kwargs[modality]: - output_kwargs[modality][modality_key] = kwargs[modality][modality_key] + # check if we received a structured kwarg dict or not to handle it correctly + if modality in kwargs: + kwarg_value = kwargs[modality].pop(modality_key, "__empty__") + # check if this key was passed as a flat kwarg. + if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: + raise ValueError( + f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." + ) + elif modality_key in kwargs: + kwarg_value = kwargs.pop(modality_key, "__empty__") + else: + kwarg_value = "__empty__" + if kwarg_value != "__empty__": + output_kwargs[modality][modality_key] = kwarg_value + # if something remains in kwargs, it belongs to common after flattening + if set(kwargs) & set(default_kwargs): + # here kwargs is dictionary-based since it shares keys with default set + [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] + else: + # here it's a flat dict + output_kwargs["common_kwargs"].update(kwargs) + + # all modality-specific kwargs are updated with common kwargs + for modality in output_kwargs: + output_kwargs[modality].update(output_kwargs["common_kwargs"]) return output_kwargs @classmethod @@ -945,6 +943,19 @@ def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) + @staticmethod + def validate_init_kwargs(processor_config, valid_kwargs): + kwargs_from_config = processor_config.keys() + unused_kwargs = {} + unused_keys = set(kwargs_from_config) - set(valid_kwargs) + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.warning( + f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. 
" + ) + unused_kwargs = {k: processor_config[k] for k in unused_keys} + return unused_kwargs + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index b43d48e530b8ce..bb4d86d3f5a500 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,7 +16,6 @@ import inspect import json -import random import tempfile @@ -39,31 +38,15 @@ from transformers.utils import is_vision_available -global_rng = random.Random() - if is_vision_available(): from PIL import Image from transformers import CLIPImageProcessor -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - @require_torch @require_vision +@require_torch class ProcessorTesterMixin: processor_class = None @@ -77,10 +60,7 @@ def get_component(self, attribute, **kwargs): component_class_name = component_class_name[0] component_class = processor_class_from_name(component_class_name) - if hasattr(self, "tmpdirname"): - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa - elif hasattr(self, "model_id"): - component = component_class.from_pretrained(self.model_id, **kwargs) # noqa + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa return component @@ -146,13 +126,13 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + tokenizer = self.get_component("tokenizer", max_length=117) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(len(inputs["input_ids"][0]), 117) @@ -161,15 +141,15 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @@ -180,15 +160,13 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = 
self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" - ) + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @@ -196,17 +174,16 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224], size=[224, 224]) + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -216,8 +193,7 @@ def test_unstructured_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -228,7 +204,6 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -243,8 +218,7 @@ def test_unstructured_kwargs_batched(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -255,10 +229,10 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, padding="longest", max_length=76, ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 6) @@ -270,8 +244,7 @@ def test_doubly_passed_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) 
self.skip_processor_without_typed_kwargs(processor) @@ -292,8 +265,7 @@ def test_structured_kwargs_nested(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -303,7 +275,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -322,8 +294,7 @@ def test_structured_kwargs_nested_from_dict(self): image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -332,7 +303,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -341,133 +312,6 @@ def test_structured_kwargs_nested_from_dict(self): self.assertEqual(len(inputs["input_ids"][0]), 76) - # text + audio kwargs testing - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117, padding="max_length") - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 117) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 117) - - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117) - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - 
self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 112) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 112) - - @require_torch - def test_unstructured_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117) - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor( - text=input_str, - audio=raw_speech, - return_tensors="pt", - padding="max_length", - max_length=76, - ) - - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 76) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 76) - - @require_torch - def test_doubly_passed_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer() - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - raw_speech = floats_list((3, 1000)) - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - audio=raw_speech, - audio_kwargs={"padding": "max_length"}, - padding="max_length", - ) - - @require_torch - @require_vision - def test_structured_kwargs_audio_nested(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer() - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - raw_speech = floats_list((3, 1000)) - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "audio_kwargs": {"padding": "max_length", "max_length": 66}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 76) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 76) - class MyProcessor(ProcessorMixin): attributes = 
["image_processor", "tokenizer"] From 86722b43d1577e9e9f697e6186083851aa9001d6 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 13:09:27 +0000 Subject: [PATCH 38/61] add unsync PR --- .../processing_grounding_dino.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index a09d21502b3c07..444b29085b0142 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,14 @@ Processor class for Grounding DINO. """ +import pathlib import sys -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -31,13 +32,21 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import ExplicitEnum, TensorType, is_torch_available if is_torch_available(): import torch +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. @@ -64,7 +73,16 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -76,8 +94,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": True, "return_length": False, "verbose": True, - }, - "images_kwargs": {}, + } } From 8baa8e080eb0724d831341f22d25fd5607ff4b76 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 13:25:34 +0000 Subject: [PATCH 39/61] revert --- tests/test_processing_common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bb4d86d3f5a500..074aa2f1d62545 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -250,13 +250,13 @@ def test_doubly_passed_kwargs(self): input_str = ["lower newer"] image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - images_kwargs={"size": {"height": 222, "width": 222}}, - size={"height": 35, "width": 35}, - ) - self.assertEqual(inputs["pixel_values"][0].shape[2], 35) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + images_kwargs={"crop_size": {"height": 222, "width": 222}}, + crop_size={"height": 214, "width": 214}, + ) @require_torch @require_vision From 
39f28afc08d3d4a371112c830acdd5a73d71e87b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 5 Aug 2024 06:54:07 +0000 Subject: [PATCH 40/61] make CI happy --- .../test_processor_grounding_dino.py | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 448aa8f7fb6433..32c61e407df027 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -22,6 +22,7 @@ import pytest from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor +from transformers.models.auto.processing_auto import processor_class_from_name from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available @@ -80,6 +81,17 @@ def setUp(self): self.embed_dim = 5 self.seq_length = 5 + def get_component(self, attribute, **kwargs): + assert attribute in self.processor_class.attributes + component_class_name = getattr(self.processor_class, f"{attribute}_class") + if isinstance(component_class_name, tuple): + component_class_name = component_class_name[0] + + component_class = processor_class_from_name(component_class_name) + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + + return component + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @@ -264,6 +276,151 @@ def test_model_input_names(self): self.assertListEqual(list(inputs.keys()), processor.model_input_names) + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size={"height": 234, "width": 234}) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + self.assertEqual(len(inputs["input_ids"][0]), 4) + + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in 
{self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 4) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") 
+ + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + @require_torch @require_vision def test_unstructured_kwargs_batched(self): From 59982bc5c9f04d4450cc3e48be1eb1918a89330f Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 5 Aug 2024 14:52:57 +0200 Subject: [PATCH 41/61] Addressed comments --- .../models/grounding_dino/configuration_grounding_dino.py | 5 +++++ tests/models/grounding_dino/test_modeling_grounding_dino.py | 1 + 2 files changed, 6 insertions(+) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 684f02d6457f70..6d9cf0156c4bcc 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -205,6 +205,11 @@ def __init__( layer_norm_eps=1e-5, **kwargs, ): + if class_loss_reduction not in ["sum", "mean"]: + raise ValueError( + f"Invalid class_loss_reduction: {class_loss_reduction}. It must be either 'sum' or 'mean'." + ) + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.") backbone_config = CONFIG_MAPPING["swin"]( diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 84fc842acb2780..fde031bd5c4c13 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -150,6 +150,7 @@ def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + # To avoid errors when running tests with `labels`, `input_ids` have to follow this structure input_ids = torch.tensor([101, 3869, 1012, 11420, 1012, 1012, 102]) input_ids = input_ids.unsqueeze(0).expand(self.batch_size, -1) From 7366aab07281f4792a5d85626efcf65b27b62a78 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 5 Aug 2024 14:12:19 +0000 Subject: [PATCH 42/61] nit --- .../test_processor_grounding_dino.py | 22 ++++++++----------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 32c61e407df027..c0bb186b392eb0 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -22,7 +22,6 @@ import pytest from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor -from transformers.models.auto.processing_auto import processor_class_from_name from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available @@ -80,17 +80,6 @@ def setUp(self):
self.embed_dim = 5 self.seq_length = 5 - def get_component(self, attribute, **kwargs): - assert attribute in self.processor_class.attributes - component_class_name = getattr(self.processor_class, f"{attribute}_class") - if isinstance(component_class_name, tuple): - component_class_name = component_class_name[0] - - component_class = processor_class_from_name(component_class_name) - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa - - return component - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @@ -306,8 +294,10 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) - self.assertEqual(len(inputs["input_ids"][0]), 4) + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", padding="max_length", max_length=112 + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) @require_vision @require_torch @@ -322,8 +312,8 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 4) + inputs = processor(text=input_str, images=image_input, return_tensors="pt", padding="max_length") + self.assertEqual(len(inputs["input_ids"][0]), 117) @require_torch @require_vision From 92e2f8473f0839df5be1256c578aaaf387829c52 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 6 Aug 2024 06:33:56 +0000 Subject: [PATCH 43/61] tmp --- examples/pytorch/zero-shot/README.md | 233 ++++++ ...t.tfevents.1722922514.a9776447cf29.50015.1 | Bin 0 -> 1642 bytes .../1722922514.4384637/hparams.yml | 29 + ...t.tfevents.1722922680.a9776447cf29.52465.1 | Bin 0 -> 1642 bytes .../1722922680.2782347/hparams.yml | 29 + ...t.tfevents.1722924013.a9776447cf29.54915.1 | Bin 0 -> 1642 bytes .../1722924013.7994215/hparams.yml | 29 + ...t.tfevents.1722924136.a9776447cf29.57364.1 | Bin 0 -> 1642 bytes .../1722924136.0336704/hparams.yml | 29 + ...t.tfevents.1722924201.a9776447cf29.59813.1 | Bin 0 -> 1642 bytes .../1722924201.254446/hparams.yml | 29 + ...t.tfevents.1722924545.a9776447cf29.62264.1 | Bin 0 -> 1642 bytes .../1722924545.4942276/hparams.yml | 29 + ...t.tfevents.1722924574.a9776447cf29.63604.1 | Bin 0 -> 1642 bytes .../1722924574.1846154/hparams.yml | 29 + ...t.tfevents.1722925039.a9776447cf29.67335.1 | Bin 0 -> 1642 bytes .../1722925039.8834014/hparams.yml | 29 + ...t.tfevents.1722925229.a9776447cf29.71066.1 | Bin 0 -> 1642 bytes .../1722925229.4961867/hparams.yml | 29 + ...t.tfevents.1722922514.a9776447cf29.50015.0 | Bin 0 -> 88 bytes ...t.tfevents.1722922680.a9776447cf29.52465.0 | Bin 0 -> 88 bytes ...t.tfevents.1722924013.a9776447cf29.54915.0 | Bin 0 -> 88 bytes ...t.tfevents.1722924136.a9776447cf29.57364.0 | Bin 0 -> 88 bytes ...t.tfevents.1722924201.a9776447cf29.59813.0 | Bin 0 -> 88 bytes ...t.tfevents.1722924545.a9776447cf29.62264.0 | Bin 0 -> 88 bytes ...t.tfevents.1722924574.a9776447cf29.63604.0 | Bin 0 -> 88 bytes ...t.tfevents.1722925039.a9776447cf29.67335.0 | Bin 0 -> 88 bytes ...t.tfevents.1722925229.a9776447cf29.71066.0 | Bin 0 -> 88 bytes examples/pytorch/zero-shot/requirements.txt | 5 + .../run_zero_shot_object_detection.py | 518 ++++++++++++ 
...n_zero_shot_object_detection_no_trainer.py | 785 ++++++++++++++++++ 31 files changed, 1802 insertions(+) create mode 100644 examples/pytorch/zero-shot/README.md create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922514.433052/events.out.tfevents.1722922514.a9776447cf29.50015.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922514.4384637/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2719772/events.out.tfevents.1722922680.a9776447cf29.52465.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2782347/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7937937/events.out.tfevents.1722924013.a9776447cf29.54915.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7994215/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.027696/events.out.tfevents.1722924136.a9776447cf29.57364.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.0336704/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924201.2479532/events.out.tfevents.1722924201.a9776447cf29.59813.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924201.254446/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924545.4884186/events.out.tfevents.1722924545.a9776447cf29.62264.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924545.4942276/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.178983/events.out.tfevents.1722924574.a9776447cf29.63604.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.1846154/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8776126/events.out.tfevents.1722925039.a9776447cf29.67335.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8834014/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4844875/events.out.tfevents.1722925229.a9776447cf29.71066.1 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4961867/hparams.yml create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922514.a9776447cf29.50015.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922680.a9776447cf29.52465.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924013.a9776447cf29.54915.0 create mode 100644 
examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924136.a9776447cf29.57364.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924201.a9776447cf29.59813.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924545.a9776447cf29.62264.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924574.a9776447cf29.63604.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925039.a9776447cf29.67335.0 create mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925229.a9776447cf29.71066.0 create mode 100644 examples/pytorch/zero-shot/requirements.txt create mode 100644 examples/pytorch/zero-shot/run_zero_shot_object_detection.py create mode 100644 examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md new file mode 100644 index 00000000000000..34d18393c7c4d2 --- /dev/null +++ b/examples/pytorch/zero-shot/README.md @@ -0,0 +1,233 @@ + + +# Object detection examples + +This directory contains 2 scripts that showcase how to fine-tune any model supported by the [`GroundingDinoForObjectDetection` API](https://huggingface.co/docs/transformers/main/en/model_doc/grounding-dino#transformers.GroundingDinoForObjectDetection) using PyTorch. + +Content: +* [PyTorch version, Trainer](#pytorch-version-trainer) +* [PyTorch version, no Trainer](#pytorch-version-no-trainer) +* [Reload and perform inference](#reload-and-perform-inference) +* [Note on custom data](#note-on-custom-data) + + +## PyTorch version, Trainer + +Based on the script [`run_object_detection.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_object_detection.py). + +The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transformers/main_classes/trainer) to automatically take care of the training for you, running on distributed environments right away. 
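+
+Unlike the plain object-detection examples, Grounding DINO is conditioned on a text prompt built from the dataset's class names; the no-trainer script further below, for instance, passes such a prompt to the processor alongside the images. As a rough, minimal sketch of the prompt format (illustrative only; the `". ".join(...)` construction and the lowercased CPPE-5 class names here are assumptions, not the scripts' literal code):
+
+```python
+# Minimal illustration (assumption): build a Grounding DINO style text prompt from the
+# dataset's category names, e.g. for CPPE-5. The classes are concatenated into one
+# string, each followed by a period.
+categories = ["coverall", "face_shield", "gloves", "goggles", "mask"]  # CPPE-5 class names, lowercased here
+prompt = ". ".join(categories) + "."
+print(prompt)  # "coverall. face_shield. gloves. goggles. mask."
+```
+
+The same category names also drive the `id2label`/`label2id` mappings that the scripts use for evaluation.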
+
+Here we show how to fine-tune a [Grounding DINO](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset:
+
+```bash
+python run_zero_shot_object_detection.py \
+    --model_name_or_path IDEA-Research/grounding-dino-tiny \
+    --dataset_name cppe-5 \
+    --do_train true \
+    --do_eval true \
+    --output_dir grounding-dino-tiny-finetuned-cppe-5-10k-steps \
+    --num_train_epochs 100 \
+    --image_square_size 600 \
+    --fp16 true \
+    --learning_rate 5e-5 \
+    --weight_decay 1e-4 \
+    --dataloader_num_workers 4 \
+    --dataloader_prefetch_factor 2 \
+    --per_device_train_batch_size 8 \
+    --gradient_accumulation_steps 1 \
+    --remove_unused_columns false \
+    --eval_do_concat_batches false \
+    --ignore_mismatched_sizes true \
+    --metric_for_best_model eval_map \
+    --greater_is_better true \
+    --load_best_model_at_end true \
+    --logging_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_strategy epoch \
+    --save_total_limit 2 \
+    --push_to_hub true \
+    --push_to_hub_model_id grounding-dino-tiny-finetuned-cppe-5-10k-steps \
+    --hub_strategy end \
+    --seed 1337
+```
+
+> Note:
+`--eval_do_concat_batches false` is required for correct evaluation of detection models;
+`--ignore_mismatched_sizes true` is required to load a detection model for fine-tuning with a different number of classes.
+
+The resulting model can be seen here: https://huggingface.co/qubvel-hf/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5. The corresponding Weights and Biases report is [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for the current example were not tuned. To improve model quality you could try:
+ - changing the image size parameter (`--image_square_size`)
+ - changing training parameters, such as learning rate, batch size, warmup, optimizer and many more (see [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments))
+ - adding more image augmentations (we created a helpful [HF Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo) to choose some)
+
+Note that you can replace the model and dataset by simply setting the `model_name_or_path` and `dataset_name` arguments respectively, with a model or dataset from the [hub](https://huggingface.co/).
+For the dataset, make sure it provides labels in the same format as the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset and that boxes are provided in [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco).
+
+![W&B report](https://i.imgur.com/ASNjamQ.png)
+
+
+## PyTorch version, no Trainer
+
+Based on the script [`run_object_detection_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_object_detection.py).
+
+The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows you to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.
+
+First, run:
+
+```bash
+accelerate config
+```
+
+and reply to the questions asked regarding the environment on which you'd like to train. Then
+
+```bash
+accelerate test
+```
+
+that will check everything is ready for training.
Finally, you can launch training with
+
+```bash
+accelerate launch run_zero_shot_object_detection_no_trainer.py \
+    --model_name_or_path "IDEA-Research/grounding-dino-tiny" \
+    --dataset_name cppe-5 \
+    --output_dir "grounding-dino-tiny-finetuned" \
+    --num_train_epochs 100 \
+    --image_square_size 600 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --checkpointing_steps epoch \
+    --learning_rate 5e-5 \
+    --ignore_mismatched_sizes \
+    --with_tracking \
+    --push_to_hub
+```
+
+and boom, you're training, possibly on multiple GPUs, logging everything to all trackers found in your environment (like Weights and Biases, Tensorboard) and regularly pushing your model to the hub (with the repo name being equal to `args.output_dir` under your HF username) 🤗
+
+With the default settings, the script fine-tunes a [Grounding DINO](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5-no-trainer.
+
+
+## Reload and perform inference
+
+After training, you can easily load your trained model and perform inference as follows:
+
+```python
+import requests
+import torch
+
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModelForObjectDetection
+
+# Name of repo on the hub or path to a local folder
+model_name = "qubvel-hf/detr-resnet-50-finetuned-10k-cppe5"
+
+image_processor = AutoImageProcessor.from_pretrained(model_name)
+model = AutoModelForObjectDetection.from_pretrained(model_name)
+
+# Load image for inference
+url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# Prepare image for the model
+inputs = image_processor(images=image, return_tensors="pt")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Post process model predictions
+# this includes conversion to Pascal VOC format and filtering of low-confidence boxes
+width, height = image.size
+target_sizes = torch.tensor([height, width]).unsqueeze(0)  # add batch dim
+results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
+
+for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+    box = [round(i, 2) for i in box.tolist()]
+    print(
+        f"Detected {model.config.id2label[label.item()]} with confidence "
+        f"{round(score.item(), 3)} at location {box}"
+    )
+```
+
+And visualize with the following code:
+```python
+from PIL import ImageDraw
+draw = ImageDraw.Draw(image)
+
+for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+    box = [round(i, 2) for i in box.tolist()]
+    x, y, x2, y2 = tuple(box)
+    draw.rectangle((x, y, x2, y2), outline="red", width=1)
+    draw.text((x, y), model.config.id2label[label.item()], fill="white")
+
+image
+```
+
+
+## Note on custom data
+
+In case you'd like to use the script with custom data, you could prepare your data in the following way:
+
+```bash
+custom_dataset/
+└── train
+    ├── 0001.jpg
+    ├── 0002.jpg
+    ├── ...
+    └── metadata.jsonl
+└── validation
+    └── ...
+└── test
+    └── ...
+```
+
+Where `metadata.jsonl` is a file with the following structure:
+```json
+{"file_name": "0001.jpg", "objects": {"bbox": [[302.0, 109.0, 73.0, 52.0]], "categories": [0], "id": [1], "area": [50.0]}}
+{"file_name": "0002.jpg", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "categories": [1], "id": [2], "area": [40.0]}}
+...
+```
+The training script supports bounding boxes in COCO format (x_min, y_min, width, height).
+
+Then, you can load the dataset with just a few lines of code:
+
+```python
+from datasets import load_dataset
+
+# Load dataset
+dataset = load_dataset("imagefolder", data_dir="custom_dataset/")
+
+# >>> DatasetDict({
+# ...     train: Dataset({
+# ...         features: ['image', 'objects'],
+# ...         num_rows: 2
+# ...     })
+# ... })
+
+# Push to hub (assumes you have run the huggingface-cli login command in a terminal/notebook)
+dataset.push_to_hub("name of repo on the hub")
+
+# optionally, you can push to a private repo on the hub
+# dataset.push_to_hub("name of repo on the hub", private=True)
+```
+
+And as the final step, for training you should provide an id2label mapping in the following way:
+```python
+id2label = {0: "Car", 1: "Bird", ...}
+```
+Just find it in the code and replace it for simplicity, or save a `json` locally and with the dataset on the hub!
+
+See also: [Dataset Creation Guide](https://huggingface.co/docs/datasets/image_dataset#create-an-image-dataset)
diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922514.433052/events.out.tfevents.1722922514.a9776447cf29.50015.1 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922514.433052/events.out.tfevents.1722922514.a9776447cf29.50015.1
new file mode 100644
index 0000000000000000000000000000000000000000..daff7299ba7963ee65ca7f33bb8d530ff0e69c01
GIT binary patch
literal 1642
zcmaJ>J!lj`6uy}A%sq`sR!!n2YD@%0y^Dz`DB@kvfIo$hMy-aq-O1gM-JNx2HZc|!
zA_yWDq9BNsij9?xh>eg&@TWFlWub+FcA}v8=JxK~yTojzNs#dX8a;iv4=_i^3b3u$dH{ta5t=TsiunQDuR&>(4`G&GD|(8LXYdA_Td5OzTOd@?8G43maO4|IBr%76!DBB?eU)&o$N`lQ4( z9c%d>$8T)EinCC0PZ)+9q1#lAsj*Nv6;m3A;?s+@sM45@L2RZhsmsmQsN+O_tGCtSN?j!HiY8_7sjL^_N#ak<2-E?65z%8ZUwq7q(*)wGP1 zCX)F=GveSrG290m@B(Kh6g<$T_4xKSGmbfKW-qBTg8=4sLRMl=Qe)Mjwrg+7zt6VE z^7?RnHXyXX%xz@Y*?#H(R>{?MI$r9LIi^l3`|Dz)8T*(;q#MDp` zM*ioJMpAdtuu9mh)rXBtEZ5&8!v>N{ft3)Gwc2i{o>yOB(mjnM=ZkP?xxbJ#VsAfr zd~zZ;#T1Ifsf?DThyoWSo5PpLp{nO$bF)*i>fsyvKHi+4JPLM;?-9!#8~pO|AQTMN zWP%18+!=0uiJ=-eHkjP$7y@tKnCEK&a!IRN+w3jN5YA1bO=uKJ6rLpGn0XqLX*G$RXksELx-;2C6clkrH28B7auGo&YPu^k#dLSIRn_^~ zqZdKMgCcnGWWCHOcjCGSUu5BV7Iu(&NG0_ZnPDp-libGw7anC{Eeo4lkaaI> z-OWx7gXIgCc_g4dBSMmp>iT5Q)DUcSsiunQCW4XeFsKb_vP3iQ<1Th2TyoaG%x~QCB01DJG6#3${gy5gO?_(W1cuj7`88@+2->;tnJ6 zJ5^IIxwHrDR01;`I&d3?4PQ@C7wZI(PnLyjFll%cV8CNk20W+{Nwwjy4S>4TCsn5D zY}fZ3eq;MhoP~;e!Z6$cL#Ap>jfKLgo6Q7# zNfK}{o+Qc)EAo)8UJV&exb?&`m3|bqlaZ>3bQo#ka*0`kuqBL?86BxdReTPsX&EU^ zB=d!4#KC=Hcm%fN15P6pJkX}~`1Kw$j%99UFR9Z&0CNW*FR>@7vFc>c71i5!sV|n- z#~afR@~;wkz0SeQ9E|^zmtsoAol(5ZgGF)SQ4ZE}DVgQ$yE)jsLC03Vo%48rn}IzW za&@Y(Cu8LcEoWt5e4R8AJ={Dm$}rZgxAfJ~PMsT0P8hP{5#+R9y~)kz z*DA2@m(owNGXIrfBqg*t1P)yve*f3* gTVk_Wm;rme$yngV7AwHuPlIKrdi#5x9_%0f0Ul2MtpET3 literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2782347/hparams.yml b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2782347/hparams.yml new file mode 100644 index 00000000000000..fd4e54228d4860 --- /dev/null +++ b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2782347/hparams.yml @@ -0,0 +1,29 @@ +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +cache_dir: null +checkpointing_steps: epoch +dataloader_num_workers: 4 +dataset_name: cppe-5 +gradient_accumulation_steps: 1 +hub_model_id: null +hub_token: null +ignore_mismatched_sizes: true +image_square_size: 600 +learning_rate: 5.0e-05 +lr_scheduler_type: linear +max_train_steps: 2700 +model_name_or_path: IDEA-Research/grounding-dino-tiny +num_train_epochs: 100 +num_warmup_steps: 0 +output_dir: grounding-dino-tiny-finetuned +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +push_to_hub: false +report_to: all +resume_from_checkpoint: null +seed: null +train_val_split: 0.15 +trust_remote_code: false +use_auth_token: false +with_tracking: true diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7937937/events.out.tfevents.1722924013.a9776447cf29.54915.1 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7937937/events.out.tfevents.1722924013.a9776447cf29.54915.1 new file mode 100644 index 0000000000000000000000000000000000000000..d43e8d3424664a82772fa99a31205a750918b108 GIT binary patch literal 1642 zcmaJ>O=uKJ6rPxIblz@E(rS|JM2(4{=+4AMTv(Bvku~6YSqz8>LQ&IQnJLm;)mv4a z9}gZD7DdE`1;L9Kb@8(Iy$T`->K=Cq2n!zcpy1ENgCM?|o*sK9(K-zD`(C~J-h1CS zCvwU8@W+W8#s5~-d-2!mF}JU#C>Q0~MJCLVnYl82Ynaed)g)DZS;t{W)k=9ual=%Z z*(@^814fd_%?oF|pEo3Rqvz4HB6*wC-G46+ck{4~G$N{KsL2#tj+o*h7C3n)54ZEM zsRcRL^Uk&W#1J^4^jSaz4H=P&MAS6KdnN|pYo8jbnQ0;z*$Vx}k|v8ZAUX=T8SkAK zg<)P7QZW*89a7^pm`^l6&oqMDVUUNk&WQdcrkG;m7dWo)9@5T7f|>W8nQSliK&MqI<^ zu$ob^Hbk*d8b&YM3M_*TTfo^vEx|ccJ_*T3kYCt9~2e#qzqOa@40m7 zxBF*$6L~$kJbA15D3#a40z4?d*e7{urghR8#mijG%aeBsaJ!I^S;4(lfF0{}>}+-m z0T;L#*tsrOXOp|JIcdD!9Da5(PI0j1tQ?H3ktU*tn&(~xM!WTvy*kpVbIZw#B33$# 
zoYty0y?OW7D(v~J^wX@|>k15Kgf@e~fr~?{|Gaxufqh>TNwx%b*QK!(9r?2gd%t9i zG%+_`g~`u3q@6TSwX8BW>-J$g6U&Ws$gqv%a^Pgdz&D*v#i~cH9DI6pW%dVfyL^vW>Bq4b zPrrkT#X3yTW`jG!&95<38^;E-JDo=0@gsX)R-lx%sh($ literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7994215/hparams.yml b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7994215/hparams.yml new file mode 100644 index 00000000000000..fd4e54228d4860 --- /dev/null +++ b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7994215/hparams.yml @@ -0,0 +1,29 @@ +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +cache_dir: null +checkpointing_steps: epoch +dataloader_num_workers: 4 +dataset_name: cppe-5 +gradient_accumulation_steps: 1 +hub_model_id: null +hub_token: null +ignore_mismatched_sizes: true +image_square_size: 600 +learning_rate: 5.0e-05 +lr_scheduler_type: linear +max_train_steps: 2700 +model_name_or_path: IDEA-Research/grounding-dino-tiny +num_train_epochs: 100 +num_warmup_steps: 0 +output_dir: grounding-dino-tiny-finetuned +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +push_to_hub: false +report_to: all +resume_from_checkpoint: null +seed: null +train_val_split: 0.15 +trust_remote_code: false +use_auth_token: false +with_tracking: true diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.027696/events.out.tfevents.1722924136.a9776447cf29.57364.1 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.027696/events.out.tfevents.1722924136.a9776447cf29.57364.1 new file mode 100644 index 0000000000000000000000000000000000000000..8f94705d1712263e2bc04c5360d983a71c87d403 GIT binary patch literal 1642 zcmaJ>O=uKJ6rM4Y(Rpr6(rS|JM2(Y$U3Q&`i6|)IjJSd5MaV@x)z)-ZW{PxowN=&m z@fI(Nh=L&N#gi_Ac;C~4;6-pZr@aWCa?pc{9s=IPSJTsD&m>xhfqvhsSKoW@`(`#1 zpU-z@?-#EBpS8u4m#6K%nxs4^&n&V)pO~I2!y(NAC1g!d>6TUG`&6!!mn7FrmYK;S z^W0-3j{I4Cym5Y8TsM3ZzApUnKCb)gvm89l!5-2GsieLllWaL;lKWWT+@l;k%)#~+ zWZlbIcXP!duzcY%j|9|bL`V`+-5Adn2jB;nYO0uSA{f~X{o0Twi_{}3^tc}HDUQN0 zuLnXh;&bIw?KGH66u-h0f_q_*`?Su8x*SnVF>wrAuq#rG&`39k77HxE*Z~+tp2TH~ z++iesr)J6}m-d65N??XV2X4cV;p++NVx1uJ$&!!@Od1}8(C0BK10K|gq}p)U1wdWu zlN!@>s_T0izp?!$&O*gKVHoa#0aG=m#zNsVOlcg7PnTO!r7<0Y*i2ngmz%9o$BF#N zqne9=BndbePZDK@6?sTkuY?RI+0cRl;JkX}~`1PzA#}YTQm(*E60CW4GAh9Q@vFdpCA8~v2 zLQgEOS2rde6xI@Xy~xA!JdAynmtsoAol(5ZgL!f8Q63)VQ!>ljck{4si;k@wcHZLw zZU**m$<^9!_ckYu*PFvnPsS09TGe(k_ z8Lz^`w;a+=8mJmpDVufsu$_tJMml8JMlu<&Qev`t?H=lR$pX{vX`Ln0=vuih?UNayXm^R5D=v{kJG_P#9$m*!FOItrM(adCC@WJ|-( k|9&u6*hp+P1(RTJHW>@t*kXgw|J7i*;xo!3O~7ipgUVE_OC literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.0336704/hparams.yml b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.0336704/hparams.yml new file mode 100644 index 00000000000000..fd4e54228d4860 --- /dev/null +++ b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.0336704/hparams.yml @@ -0,0 +1,29 @@ +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +cache_dir: null +checkpointing_steps: epoch +dataloader_num_workers: 4 +dataset_name: cppe-5 +gradient_accumulation_steps: 1 +hub_model_id: null +hub_token: null +ignore_mismatched_sizes: true +image_square_size: 600 +learning_rate: 5.0e-05 +lr_scheduler_type: linear +max_train_steps: 2700 
+model_name_or_path: IDEA-Research/grounding-dino-tiny +num_train_epochs: 100 +num_warmup_steps: 0 +output_dir: grounding-dino-tiny-finetuned +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +push_to_hub: false +report_to: all +resume_from_checkpoint: null +seed: null +train_val_split: 0.15 +trust_remote_code: false +use_auth_token: false +with_tracking: true diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924201.2479532/events.out.tfevents.1722924201.a9776447cf29.59813.1 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924201.2479532/events.out.tfevents.1722924201.a9776447cf29.59813.1 new file mode 100644 index 0000000000000000000000000000000000000000..f8ed5c44e8ce97dd02100896d4f080b776acd55e GIT binary patch literal 1642 zcmaJ>O=uHA6i#i7&C}X;)wXV{)>cr|q_(12)Hi=g%*o`%WpG#R?Pv(C)6)}t3e zMMMz1cn}p4yn9g)ya@gr1;JAfdQjAxH}Oq28#ifn4uO5&n>XKk@B4Bx6QAF=Pu|O) z{(fe+c>3{}-CdEC2gRv*7U;v1Gey{|S)hch2rAv8ihQ5S#o~hGn#m$FS!9-bjKq-- z@MQ4Js<>|WCH$Osy3W???tYbpms!|IY9W=>S7e-B3Yp|S7MOmVg@;*K(}1kIS?f-A zv>z;AxXdE~^%)V8gjCl?I!1e8ol7-UOxF>NY=9nZNRxT$5fyq|k93X>!2quYLNelW zT(L%-qc3F=~`H@F87Xe8U za4?=E$_y*=kgi?~8BVzI!~&Im6gHEQs)%$LY2tE;Sv{~ejFcH2sYMlh4y$PyDNQ8v zg=WOTePVb3HsAx!Tqt;;P3!UN9cCO0+{|85XAS|(?S{O>o}|XABOSMnY<_;eGnUt< zYhw@cFA{mZ$-(O!4F8pvVoJrGQM}B9Suy=M2M=>8ndR&|IoPs7$5xk}^LT)pfvqcY zwbt0H>yyUo_2H){W99oA&dR{>GHD{ZzkXhoV5nVh>8pdSIyao0Fl2=T$Z5HHlbbKE zlwrp|rJrPFewJV$CA3Kd_FV1%`2N?A672e~NYW*+yEcs_=-{I=?EIfGlElnN8OHwQ zkY-YE*|194tlfvrOe{CpBEu$<$$*s-leOCJq@I^7Fzue^l?!D!w9>zjHD+%=bz|46IICT{4Hs2#wI6nM!=^&I0)?$Jt z8{8Rgewm?~I5wEx=>!7r-kS5R1ckI!t!?&}6$lq*QSv$pn7jSl-U~+>8h-uk!AyQB fvDpNSgT35jEO29s^+C^HgJnlob#+|7eJ!lj`6uy}A^q$5ft0r+1H70_h-o-=|jpChX@TU>eh}AH+JGmROyR**BCdNv% z5JfBm!9pt)8#@bI!A9_>4_KwpLP0wVL40$2d)d20r#RU6y?OJ!_rCAWW#aSa!MP{- z4VTCb@pAE$-Bpp42gRv57U;=|=^`A^EKovL1eI=4MZQnvLUCSl&18|8EHcAAM&ih; zqjzpyTocy~zlLA(J$_tw-`gy_$-)*=3#p{OB4g}Y$Rzi%z=aoCc$S5A4ajQrebREISX6V+2G?}9wQK856aOcP%^z&*UBqKgo zKGjZ*xkT~nOd+@xdbv-ljHs&-#S|0AumzhU#R!dbjc75;0*vj1LF7qXHpd-C;&&>h zTykj#*r^0&IJDq4^clXMpf1)4BA?6)Im@Ks(F25 zC-58Fuj4FK+!Kc37U(fmV`?lEPR*3Yq4@M#BdRo}V-TCEO6qd6HR?E#A9+-B5s)MS z2jfYi%&;O4>FR}$;e;Dc%v0${VKW)2ib#i%CN7tl)eRfMNSV=*T2#U3u$q>U(nK;} zXhs~|Cx-iBGd|$VhJpv$v>w0SVa74f&Fm$0W)Z;LF33ylNouS*+;R5!@2{6TV|jhP zJNh(VOXT%F2k&w)v@9>hl!`l}c$o(?;=+p@Jj{ykn zwa#8!pEO>t4?jH_E8p92RtAPvNE6XR_4Bp_gY9}tUma-Gx#8r5AuAk4PAk=$+j+{Uj^%vjqJqp-mvL|90Qvr(Zuxu=l?rNteLx+BBA+1JBE_=YPgX5;MbP82y(+ znn^un!zyL7b{{q~vD`q544X(M16E2*)>^xhdS0@?w0oLYu9V@}YX3sksJ-j_nWLl20zDKNZdg$BI5hxj~#RN?@ zxHH`R3PUwO>7fK6n2PX;@5^G3r)f%fJu-*P&-YkB7rLGP$E$-ND)%uG#c+t?2&hOw)2ya z9=Ra3h)Re^m0C_ol}qnEl8OtWKPN6oJ#l~o2zu)c@y2VLwG)_wrG4Lk&vs9YOl_GqWE`AA@~W5@{raTQQsPhDK?H_2lg4o2sOG+v}mvhV@F^Tc@m$k zaF3DXovJO@zH}JeR06Xcx^Nf9EZ;y-AL~RWB&$L;n6x}bVJKi!Mm(w!Nwwv$4}khK zBvq#AT+jDBeq;MhoW+UKeQvi#Jkc$0jn|vQPfy0lkG7nZfvFwRMD+9Kc~FMQUcIHSPIT+sa`NJs6~91E zJJs91xqYJo$KEUbc2?%UGK{B$Hjlumn`2M@`TunpPJB=#=@QsokH*^Q#IF@N{vl(u z6ElSh%)HMbourY9WtFm7uMaz!SZ<qG=uRdFo!bCnwwunEXh^k(LgWXQWs%LL~_Vo7J;yG}8e2-Z1tEm@H zPea*aT_)(T!JXmecNnUJV}t3P&LQykU-rB#Losbt=ZJe~7sAD5l)N?(^Y6dheDP&V n!w-MoSjt~*+iVVI!QE*x7Ws+AMq&7!!LrkP2K$!op1<-I;e7$~ literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.1846154/hparams.yml b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.1846154/hparams.yml new file mode 100644 index 00000000000000..fd4e54228d4860 --- /dev/null +++ 
b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.1846154/hparams.yml @@ -0,0 +1,29 @@ +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +cache_dir: null +checkpointing_steps: epoch +dataloader_num_workers: 4 +dataset_name: cppe-5 +gradient_accumulation_steps: 1 +hub_model_id: null +hub_token: null +ignore_mismatched_sizes: true +image_square_size: 600 +learning_rate: 5.0e-05 +lr_scheduler_type: linear +max_train_steps: 2700 +model_name_or_path: IDEA-Research/grounding-dino-tiny +num_train_epochs: 100 +num_warmup_steps: 0 +output_dir: grounding-dino-tiny-finetuned +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +push_to_hub: false +report_to: all +resume_from_checkpoint: null +seed: null +train_val_split: 0.15 +trust_remote_code: false +use_auth_token: false +with_tracking: true diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8776126/events.out.tfevents.1722925039.a9776447cf29.67335.1 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8776126/events.out.tfevents.1722925039.a9776447cf29.67335.1 new file mode 100644 index 0000000000000000000000000000000000000000..1c19610eebecebd3ae59e6edae42cb2947e93f7f GIT binary patch literal 1642 zcmaJ>O=uHA6i#i7=4oxaYFoF})>Kf`q_(1H5fjmZdJzjEo`%WpG#R?Pv(C)+$CG#w zL_7$B7f&jB@UABjiXixN7d$EGK}By0g7_wz#7$bALtx+c=FRus`@Wn^#pn0UlXtSY z$)CT(qqoQGuCk;&C``?sC6~ z2bM2f=8=HWQcV@pH3TCYp<5f$WR7}7g&x-TNt@fc~o-|kVXOy z#*;*uVMQL&)r%p+3D=*Pr_ztYW-?M0kq#qGTrM%I8`g!9GNU8asEp5HH7z5hiDbUe zj5xSY4EMuEe88Cv1rM}oJ$}8zjANdg*-ProB7nJFkd@ez)L3<-<8toF^Yfjtyxv_K zy_-EZU6g65qAAoB*_xkU7N-l=)nCF?EIfG8i}cq5{&-K zA#dvCTq3bNjf7-RbN? f!)D_!2KI83vA~Tj)(hQ#4VKQWTH7)9w2=P;<01n? 
literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8834014/hparams.yml b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8834014/hparams.yml new file mode 100644 index 00000000000000..fd4e54228d4860 --- /dev/null +++ b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8834014/hparams.yml @@ -0,0 +1,29 @@ +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +cache_dir: null +checkpointing_steps: epoch +dataloader_num_workers: 4 +dataset_name: cppe-5 +gradient_accumulation_steps: 1 +hub_model_id: null +hub_token: null +ignore_mismatched_sizes: true +image_square_size: 600 +learning_rate: 5.0e-05 +lr_scheduler_type: linear +max_train_steps: 2700 +model_name_or_path: IDEA-Research/grounding-dino-tiny +num_train_epochs: 100 +num_warmup_steps: 0 +output_dir: grounding-dino-tiny-finetuned +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +push_to_hub: false +report_to: all +resume_from_checkpoint: null +seed: null +train_val_split: 0.15 +trust_remote_code: false +use_auth_token: false +with_tracking: true diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4844875/events.out.tfevents.1722925229.a9776447cf29.71066.1 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4844875/events.out.tfevents.1722925229.a9776447cf29.71066.1 new file mode 100644 index 0000000000000000000000000000000000000000..c62c0ae3dbbe340d44904ab45939f17e6185ef86 GIT binary patch literal 1642 zcmaJ>J!lj`6uy}A^q$5ft0ubn^&%+hT}(tl5ib%0nnq0{R>R!xl&$3G= z$kFiS$MbeaNm3pZrsi3o&y3F$;FxBC60#(ybPFo-eJU3V3zBOl3(RDZS?)0sM~03q z-MG9Zt{Z*{KWATGjq4tHlZKTv>>-tqO6n^z%5H^Bavuv!FQ?%}8n)FS>v7t8l+N{l zT+zZ{@r)5Ue&4^-(iDTG;U6EphM!G_@m}3FPhM*655|_<$hmrW5k||fe zGzfMgff)`>xD7pquP3OBb%Mwz3qsB@X?S!)hsUT4cu*#iYQtd{0ClNPN=(zSmhW-= z#`dc?3l;Z-VYmmnOx2hg3x!iLrEw@ey;X}Ujp-P~X3CPf+-!|HPUJ@()m#Lmo`8e# zBvEEqk%x5kV#sj9wI>#+^rNtmj8sLW!$=dCOU&wo9bu%*=tw0h;d59`%SdS=nJ+XW z4(=1fy|5b}aOOh618rK5UvD$xSm0*%k~(t;U~UIwCH5pWRvm6DOg_$CYmepi>E6im zY+J%Ncl)Nysc?Kgz(sCLLSb?Tp6*+zbqE%GKIt zZ>>%muUChkoQ#$2t~o0O{Trl-=!xoiQ-r=&y(O>iYu35p=xf6mY?kZx_%0Z25T}wgAKNZ zo8Mrl296CTcRGf^+BK9jvtx7iqs bg1ymXEO29sbwlT0gQas@wzoZ)UfTNy8xj6# literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4961867/hparams.yml b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4961867/hparams.yml new file mode 100644 index 00000000000000..a6ee74feb4d073 --- /dev/null +++ b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4961867/hparams.yml @@ -0,0 +1,29 @@ +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +cache_dir: null +checkpointing_steps: epoch +dataloader_num_workers: 4 +dataset_name: cppe-5 +gradient_accumulation_steps: 1 +hub_model_id: null +hub_token: null +ignore_mismatched_sizes: true +image_square_size: 600 +learning_rate: 5.0e-05 +lr_scheduler_type: linear +max_train_steps: 21300 +model_name_or_path: IDEA-Research/grounding-dino-tiny +num_train_epochs: 100 +num_warmup_steps: 0 +output_dir: grounding-dino-tiny-finetuned +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +push_to_hub: false +report_to: all +resume_from_checkpoint: null +seed: null +train_val_split: 0.15 +trust_remote_code: false +use_auth_token: 
false +with_tracking: true diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922514.a9776447cf29.50015.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922514.a9776447cf29.50015.0 new file mode 100644 index 0000000000000000000000000000000000000000..c5085c613a378cc7f9d3e0f0ba38503b08a6a725 GIT binary patch literal 88 zcmeZZfPjCKJmzx77tU_UU31e>iZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-!q@*~B>*P5AdLV3 literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922680.a9776447cf29.52465.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922680.a9776447cf29.52465.0 new file mode 100644 index 0000000000000000000000000000000000000000..c31669781c6f94717a33037726b6bdadc9730ce2 GIT binary patch literal 88 zcmeZZfPjCKJmzv%Xb7&$U31e>iZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-5)=FOH~{`DAV2^B literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924013.a9776447cf29.54915.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924013.a9776447cf29.54915.0 new file mode 100644 index 0000000000000000000000000000000000000000..b6eac80b292be0b92e14640b3714859cfa694890 GIT binary patch literal 88 zcmeZZfPjCKJmzvT&Mf+!x8|m!6mL>dVrHJ6YguYuiIq{19+yr@YF=@EQBrM*}QKepaQD#YMkzOiDReV}zPHH?vBu^$!7yv7BABO+{ literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924136.a9776447cf29.57364.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924136.a9776447cf29.57364.0 new file mode 100644 index 0000000000000000000000000000000000000000..0d5c402e493c5c38a9b591b3c319e28dfac8f3c1 GIT binary patch literal 88 zcmeZZfPjCKJmzv9=wXz~Uvtw@iZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-a{P(I0{{e=AgBNU literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924201.a9776447cf29.59813.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924201.a9776447cf29.59813.0 new file mode 100644 index 0000000000000000000000000000000000000000..4dc60ced50f349842b4bd01c2e150ac73eed7246 GIT binary patch literal 88 zcmeZZfPjCKJmzxFUhc1zzviZ+6mL>dVrHJ6YguYuiIq{19+yr@YF=@EQBrM*}QKepaQD#YMkzOiDReV}zPHH?vWJ5;M4FDRsAcFt^ literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924545.a9776447cf29.62264.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924545.a9776447cf29.62264.0 new file mode 100644 index 0000000000000000000000000000000000000000..9dd492b6fc3317075357f86e0170590407807fd8 GIT binary patch literal 88 zcmeZZfPjCKJmzw`Fvd6Jues?c#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{Z={)ax5diKNT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{Zc`5Tv0{|SnAYA|e literal 0 HcmV?d00001 diff --git 
a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925039.a9776447cf29.67335.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925039.a9776447cf29.67335.0 new file mode 100644 index 0000000000000000000000000000000000000000..a0236d45bb8b6a72ef6baddcb446988cb9cbe98b GIT binary patch literal 88 zcmeZZfPjCKJmzv%KK=eXf6YxtDc+=_#LPTB*Rs^S5-X!1JuaP+)V$*SqNM!9q7=R2 h(%js{qDsB;qRf)iBE3|Qs`#|boYZ)Th|ssX{{W2eB1-@O literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925229.a9776447cf29.71066.0 b/examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925229.a9776447cf29.71066.0 new file mode 100644 index 0000000000000000000000000000000000000000..a8375abb5c27a05c218bcdd2ccb418a9192ad99b GIT binary patch literal 88 zcmeZZfPjCKJmzwWE{M}ESaZ`+iZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-^5coN1pxQHAM^kK literal 0 HcmV?d00001 diff --git a/examples/pytorch/zero-shot/requirements.txt b/examples/pytorch/zero-shot/requirements.txt new file mode 100644 index 00000000000000..2aa0d9bcf01672 --- /dev/null +++ b/examples/pytorch/zero-shot/requirements.txt @@ -0,0 +1,5 @@ +albumentations >= 1.4.5 +timm +datasets +torchmetrics +pycocotools diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py new file mode 100644 index 00000000000000..5d98267625e5b1 --- /dev/null +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +"""Finetuning any 🤗 Transformers model supported by AutoModelForZeroShotObjectDetection for object detection leveraging the Trainer API.""" + +import logging +import os +import sys +from dataclasses import dataclass, field +from functools import partial +from typing import Any, List, Mapping, Optional, Tuple, Union + +import albumentations as A +import numpy as np +import torch +from datasets import load_dataset +from torchmetrics.detection.mean_ap import MeanAveragePrecision + +import transformers +from transformers import ( + AutoConfig, + AutoProcessor, + AutoModelForZeroShotObjectDetection, + HfArgumentParser, + Trainer, + TrainingArguments, +) +from transformers.image_processing_utils import BatchFeature +from transformers.image_transforms import center_to_corners_format +from transformers.trainer import EvalPrediction +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + + +logger = logging.getLogger(__name__) + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+check_min_version("4.44.0.dev0") + +require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/zero_shot/requirements.txt") + + +@dataclass +class ModelOutput: + logits: torch.Tensor + pred_boxes: torch.Tensor + + +def format_image_annotations_as_coco( + image_id: str, categories: List[int], areas: List[float], bboxes: List[Tuple[float]] +) -> dict: + """Format one set of image annotations to the COCO format + + Args: + image_id (str): image id. e.g. "0001" + categories (List[int]): list of categories/class labels corresponding to provided bounding boxes + areas (List[float]): list of corresponding areas to provided bounding boxes + bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format + ([center_x, center_y, width, height] in absolute coordinates) + + Returns: + dict: { + "image_id": image id, + "annotations": list of formatted annotations + } + """ + annotations = [] + for category, area, bbox in zip(categories, areas, bboxes): + formatted_annotation = { + "image_id": image_id, + "category_id": category, + "iscrowd": 0, + "area": area, + "bbox": list(bbox), + } + annotations.append(formatted_annotation) + + return { + "image_id": image_id, + "annotations": annotations, + } + + +def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor: + """ + Convert bounding boxes from YOLO format (x_center, y_center, width, height) in range [0, 1] + to Pascal VOC format (x_min, y_min, x_max, y_max) in absolute coordinates. + + Args: + boxes (torch.Tensor): Bounding boxes in YOLO format + image_size (Tuple[int, int]): Image size in format (height, width) + + Returns: + torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) + """ + # convert center to corners format + boxes = center_to_corners_format(boxes) + + # convert to absolute coordinates + height, width = image_size + boxes = boxes * torch.tensor([[width, height, width, height]]) + + return boxes + + +def augment_and_transform_batch( + examples: Mapping[str, Any], + transform: A.Compose, + image_processor: AutoImageProcessor, + return_pixel_mask: bool = False, +) -> BatchFeature: + """Apply augmentations and format annotations in COCO format for object detection task""" + + images = [] + annotations = [] + for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]): + image = np.array(image.convert("RGB")) + + # apply augmentations + output = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + images.append(output["image"]) + + # format annotations in COCO format + formatted_annotations = format_image_annotations_as_coco( + image_id, output["category"], objects["area"], output["bboxes"] + ) + annotations.append(formatted_annotations) + + # Apply the image processor transformations: resizing, rescaling, normalization + result = image_processor(images=images, annotations=annotations, return_tensors="pt") + + if not return_pixel_mask: + result.pop("pixel_mask", None) + + return result + + +def collate_fn(batch: List[BatchFeature]) -> Mapping[str, Union[torch.Tensor, List[Any]]]: + data = {} + data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch]) + data["labels"] = [x["labels"] for x in batch] + if "pixel_mask" in batch[0]: + data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch]) + return data + + +@torch.no_grad() +def compute_metrics( + evaluation_results: EvalPrediction, + image_processor: AutoImageProcessor, + threshold: float = 0.0, + id2label: 
Optional[Mapping[int, str]] = None, +) -> Mapping[str, float]: + """ + Compute mean average mAP, mAR and their variants for the object detection task. + + Args: + evaluation_results (EvalPrediction): Predictions and targets from evaluation. + threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0. + id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None. + + Returns: + Mapping[str, float]: Metrics in a form of dictionary {: } + """ + + predictions, targets = evaluation_results.predictions, evaluation_results.label_ids + + # For metric computation we need to provide: + # - targets in a form of list of dictionaries with keys "boxes", "labels" + # - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels" + + image_sizes = [] + post_processed_targets = [] + post_processed_predictions = [] + + # Collect targets in the required format for metric computation + for batch in targets: + # collect image sizes, we will need them for predictions post processing + batch_image_sizes = torch.tensor([x["orig_size"] for x in batch]) + image_sizes.append(batch_image_sizes) + # collect targets in the required format for metric computation + # boxes were converted to YOLO format needed for model training + # here we will convert them to Pascal VOC format (x_min, y_min, x_max, y_max) + for image_target in batch: + boxes = torch.tensor(image_target["boxes"]) + boxes = convert_bbox_yolo_to_pascal(boxes, image_target["orig_size"]) + labels = torch.tensor(image_target["class_labels"]) + post_processed_targets.append({"boxes": boxes, "labels": labels}) + + # Collect predictions in the required format for metric computation, + # model produce boxes in YOLO format, then image_processor convert them to Pascal VOC format + for batch, target_sizes in zip(predictions, image_sizes): + batch_logits, batch_boxes = batch[1], batch[2] + output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes)) + post_processed_output = image_processor.post_process_object_detection( + output, threshold=threshold, target_sizes=target_sizes + ) + post_processed_predictions.extend(post_processed_output) + + # Compute metrics + metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True) + metric.update(post_processed_predictions, post_processed_targets) + metrics = metric.compute() + + # Replace list of per class metrics with separate metric for each class + classes = metrics.pop("classes") + map_per_class = metrics.pop("map_per_class") + mar_100_per_class = metrics.pop("mar_100_per_class") + for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class): + class_name = id2label[class_id.item()] if id2label is not None else class_id.item() + metrics[f"map_{class_name}"] = class_map + metrics[f"mar_100_{class_name}"] = class_mar + + metrics = {k: round(v.item(), 4) for k, v in metrics.items()} + + return metrics + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify + them on the command line. + """ + + dataset_name: str = field( + default="cppe-5", + metadata={ + "help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)." 
+ }, + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_val_split: Optional[float] = field( + default=0.15, metadata={"help": "Percent to split off of train for validation."} + ) + image_square_size: Optional[int] = field( + default=600, + metadata={"help": "Image longest size will be resized to this value, then image will be padded to square."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="IDEA-Research/grounding-dino-tiny", + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) + ignore_mismatched_sizes: bool = field( + default=False, + metadata={ + "help": "Whether or not to raise an error if some of the weights from the checkpoint do not have the same size as the weights of the model (if for instance, you are instantiating a model with 10 labels from a checkpoint with 3 labels)." + }, + ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ) + }, + ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. 
The + # # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_object_detection", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: + checkpoint = get_last_checkpoint(training_args.output_dir) + if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # ------------------------------------------------------------------------------------------------ + # Load dataset, prepare splits + # ------------------------------------------------------------------------------------------------ + + dataset = load_dataset( + data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code + ) + + # If we don't have a validation split, split off a percentage of train as validation + data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split + if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: + split = dataset["train"].train_test_split(data_args.train_val_split, seed=training_args.seed) + dataset["train"] = split["train"] + dataset["validation"] = split["test"] + + # Get dataset categories and prepare mappings for label_name <-> label_id + categories = dataset["train"].features["objects"].feature["category"].names + id2label = dict(enumerate(categories)) + label2id = {v: k for k, v in id2label.items()} + + # ------------------------------------------------------------------------------------------------ + # Load pretrained config, model and image processor + # ------------------------------------------------------------------------------------------------ + + common_pretrained_args = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, + } + config = AutoConfig.from_pretrained( + model_args.config_name or model_args.model_name_or_path, + label2id=label2id, + id2label=id2label, + **common_pretrained_args, + ) + model = AutoModelForZeroShotObjectDetection.from_pretrained( + model_args.model_name_or_path, + config=config, + ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + **common_pretrained_args, + ) + processor = AutoProcessor.from_pretrained( + model_args.image_processor_name or model_args.model_name_or_path, + ) + + # ------------------------------------------------------------------------------------------------ + # Define image augmentations and dataset transforms + # ------------------------------------------------------------------------------------------------ + max_size = data_args.image_square_size + train_augment_and_transform = A.Compose( + [ + A.Compose( + [ + A.SmallestMaxSize(max_size=max_size, p=1.0), + A.RandomSizedBBoxSafeCrop(height=max_size, width=max_size, p=1.0), + ], + p=0.2, + ), + A.OneOf( + [ + A.Blur(blur_limit=7, p=0.5), + A.MotionBlur(blur_limit=7, p=0.5), + A.Defocus(radius=(1, 5), alias_blur=(0.1, 0.25), p=0.1), + ], + p=0.1, + ), + A.Perspective(p=0.1), + A.HorizontalFlip(p=0.5), + A.RandomBrightnessContrast(p=0.5), + A.HueSaturationValue(p=0.1), + ], + bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25), + ) + validation_transform = A.Compose( + [A.NoOp()], + bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True), + ) + + # Make transform functions for batch and apply for dataset splits + train_transform_batch = partial( + augment_and_transform_batch, transform=train_augment_and_transform, image_processor=processor + ) + validation_transform_batch = partial( + augment_and_transform_batch, transform=validation_transform, image_processor=processor + ) + + dataset["train"] = dataset["train"].with_transform(train_transform_batch) + dataset["validation"] = dataset["validation"].with_transform(validation_transform_batch) + 
dataset["test"] = dataset["test"].with_transform(validation_transform_batch) + + # ------------------------------------------------------------------------------------------------ + # Model training and evaluation with Trainer API + # ------------------------------------------------------------------------------------------------ + + eval_compute_metrics_fn = partial( + compute_metrics, image_processor=processor, id2label=id2label, threshold=0.0 + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"] if training_args.do_train else None, + eval_dataset=dataset["validation"] if training_args.do_eval else None, + tokenizer=processor, + data_collator=collate_fn, + compute_metrics=eval_compute_metrics_fn, + ) + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + + # Final evaluation + if training_args.do_eval: + metrics = trainer.evaluate(eval_dataset=dataset["test"], metric_key_prefix="test") + trainer.log_metrics("test", metrics) + trainer.save_metrics("test", metrics) + + # Write model card and (optionally) push to hub + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": data_args.dataset_name, + "tags": ["object-detection", "vision"], + } + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py new file mode 100644 index 00000000000000..d497204e7564b7 --- /dev/null +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py @@ -0,0 +1,785 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Finetuning 🤗 Transformers model for object detection with Accelerate.""" + +import argparse +import json +import logging +import math +import os +from functools import partial +from pathlib import Path +from typing import Any, List, Mapping, Tuple, Union + +import albumentations as A +import datasets +import numpy as np +import torch +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import HfApi +from torch.utils.data import DataLoader +from torchmetrics.detection.mean_ap import MeanAveragePrecision +from tqdm.auto import tqdm + +import transformers +from transformers import ( + AutoConfig, + AutoProcessor, + AutoModelForZeroShotObjectDetection, + SchedulerType, + get_scheduler, +) +from transformers.image_processing_utils import BatchFeature +from transformers.image_transforms import center_to_corners_format +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.44.0.dev0") + +logging.basicConfig(level=logging.INFO) +logger = get_logger(__name__) + +require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") + + +# Copied from examples/pytorch/object-detection/run_object_detection.format_image_annotations_as_coco +def format_image_annotations_as_coco( + image_id: str, categories: List[int], areas: List[float], bboxes: List[Tuple[float]] +) -> dict: + """Format one set of image annotations to the COCO format + + Args: + image_id (str): image id. e.g. "0001" + categories (List[int]): list of categories/class labels corresponding to provided bounding boxes + areas (List[float]): list of corresponding areas to provided bounding boxes + bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format + ([center_x, center_y, width, height] in absolute coordinates) + + Returns: + dict: { + "image_id": image id, + "annotations": list of formatted annotations + } + """ + annotations = [] + for category, area, bbox in zip(categories, areas, bboxes): + formatted_annotation = { + "image_id": image_id, + "category_id": category, + "iscrowd": 0, + "area": area, + "bbox": list(bbox), + } + annotations.append(formatted_annotation) + + return { + "image_id": image_id, + "annotations": annotations, + } + + +# Copied from examples/pytorch/object-detection/run_object_detection.convert_bbox_yolo_to_pascal +def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor: + """ + Convert bounding boxes from YOLO format (x_center, y_center, width, height) in range [0, 1] + to Pascal VOC format (x_min, y_min, x_max, y_max) in absolute coordinates. 
+ + Args: + boxes (torch.Tensor): Bounding boxes in YOLO format + image_size (Tuple[int, int]): Image size in format (height, width) + + Returns: + torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) + """ + # convert center to corners format + boxes = center_to_corners_format(boxes) + + # convert to absolute coordinates + height, width = image_size + boxes = boxes * torch.tensor([[width, height, width, height]]) + + return boxes + + +# Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch +def augment_and_transform_batch( + examples: Mapping[str, Any], + transform: A.Compose, + processor: AutoProcessor, + prompt: str, + return_pixel_mask: bool = False, +) -> BatchFeature: + """Apply augmentations and format annotations in COCO format for object detection task""" + + images = [] + annotations = [] + text = [] + for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]): + image = np.array(image.convert("RGB")) + + # apply augmentations + output = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + images.append(output["image"]) + + # format annotations in COCO format + formatted_annotations = format_image_annotations_as_coco( + image_id, output["category"], objects["area"], output["bboxes"] + ) + annotations.append(formatted_annotations) + text.append(prompt) + + # Apply the image processor transformations: resizing, rescaling, normalization + result = processor(images=images, text=text, annotations=annotations, return_tensors="pt") + + if not return_pixel_mask: + result.pop("pixel_mask", None) + + return result + + +# Copied from examples/pytorch/object-detection/run_object_detection.collate_fn +def collate_fn(batch: List[BatchFeature]) -> Mapping[str, Union[torch.Tensor, List[Any]]]: + data = {} + data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch]) + data["input_ids"] = torch.stack([x["input_ids"] for x in batch]) + data["token_type_ids"] = torch.stack([x["token_type_ids"] for x in batch]) + data["labels"] = [x["labels"] for x in batch] + if "pixel_mask" in batch[0]: + data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch]) + if "attention_mask" in batch[0]: + data["attention_mask"] = torch.stack([x["attention_mask"] for x in batch]) + return data + + +def nested_to_cpu(objects): + """Move nested tesnors in objects to CPU if they are on GPU""" + if isinstance(objects, torch.Tensor): + return objects.cpu() + elif isinstance(objects, Mapping): + return type(objects)({k: nested_to_cpu(v) for k, v in objects.items()}) + elif isinstance(objects, (list, tuple)): + return type(objects)([nested_to_cpu(v) for v in objects]) + elif isinstance(objects, (np.ndarray, str, int, float, bool)): + return objects + raise ValueError(f"Unsupported type {type(objects)}") + + +def evaluation_loop( + model: torch.nn.Module, + processor: AutoProcessor, + accelerator: Accelerator, + dataloader: DataLoader, + id2label: Mapping[int, str], +) -> dict: + model.eval() + metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True) + + for step, batch in enumerate(tqdm(dataloader, disable=not accelerator.is_local_main_process)): + with torch.no_grad(): + outputs = model(**batch) + + # For metric computation we need to collect ground truth and predicted boxes in the same format + + # 1. 
Collect predicted boxes, classes, scores + # processor convert boxes from YOLO format to Pascal VOC format + # ([x_min, y_min, x_max, y_max] in absolute coordinates) + image_size = torch.stack([example["orig_size"] for example in batch["labels"]], dim=0) + predictions = processor.post_process_grounded_object_detection(outputs, threshold=0.0, target_sizes=image_size) + predictions = nested_to_cpu(predictions) + + # 2. Collect ground truth boxes in the same format for metric computation + # Do the same, convert YOLO boxes to Pascal VOC format + target = [] + for label in batch["labels"]: + label = nested_to_cpu(label) + boxes = convert_bbox_yolo_to_pascal(label["boxes"], label["orig_size"]) + labels = label["class_labels"] + target.append({"boxes": boxes, "labels": labels}) + + metric.update(predictions, target) + + metrics = metric.compute() + + # Replace list of per class metrics with separate metric for each class + classes = metrics.pop("classes") + map_per_class = metrics.pop("map_per_class") + mar_100_per_class = metrics.pop("mar_100_per_class") + for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class): + class_name = id2label[class_id.item()] + metrics[f"map_{class_name}"] = class_map + metrics[f"mar_100_{class_name}"] = class_mar + + # Convert metrics to float + metrics = {k: round(v.item(), 4) for k, v in metrics.items()} + + return metrics + + +def parse_args(): + parser = argparse.ArgumentParser(description="Finetune a transformers model for object detection task") + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to a pretrained model or model identifier from huggingface.co/models.", + default="IDEA-Research/grounding-dino-tiny", + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset on the hub.", + default="cppe-5", + ) + parser.add_argument( + "--train_val_split", + type=float, + default=0.15, + help="Fraction of the dataset to be used for validation.", + ) + parser.add_argument( + "--ignore_mismatched_sizes", + action="store_true", + help="Ignore mismatched sizes between the model and the dataset.", + ) + parser.add_argument( + "--image_square_size", + type=int, + default=1333, + help="Image longest size will be resized to this value, then image will be padded to square.", + ) + parser.add_argument( + "--cache_dir", + type=str, + help="Path to a folder in which the model and dataset will be cached.", + ) + parser.add_argument( + "--use_auth_token", + action="store_true", + help="Whether to use an authentication token to access the model repository.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=4, + help="Number of workers to use for the dataloaders.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--adam_beta1", + type=float, + default=0.9, + help="Beta1 for AdamW optimizer", + ) + parser.add_argument( + "--adam_beta2", + type=float, + default=0.999, + help="Beta2 for AdamW optimizer", + ) + parser.add_argument( + "--adam_epsilon", + type=float, + default=1e-8, + help="Epsilon for AdamW optimizer", + ) + 
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ), + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations. ' + "Only applicable when `--with_tracking` is passed." + ), + ) + args = parser.parse_args() + + # Sanity checks + if args.push_to_hub or args.with_tracking: + if args.output_dir is None: + raise ValueError( + "Need an `output_dir` to create a repo when `--push_to_hub` or `with_tracking` is specified." + ) + + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + return args + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_object_detection_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
+ # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["project_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + # We set device_specific to True as we want different data augmentation per device. + if args.seed is not None: + set_seed(args.seed, device_specific=True) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + # Retrieve of infer repo_name + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + # Create repo and retrieve repo_id + api = HfApi() + repo_id = api.create_repo(repo_name, exist_ok=True, token=args.hub_token).repo_id + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Load dataset + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code) + + # If we don't have a validation split, split off a percentage of train as validation. + args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split + if isinstance(args.train_val_split, float) and args.train_val_split > 0.0: + split = dataset["train"].train_test_split(args.train_val_split, seed=args.seed) + dataset["train"] = split["train"] + dataset["validation"] = split["test"] + + # Get dataset categories and prepare mappings for label_name <-> label_id + categories = dataset["train"].features["objects"].feature["category"].names + id2label = dict(enumerate(categories)) + prompt = ". ".join(id2label.values()) + "." 
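+    # For CPPE-5 this builds a text prompt like "Coverall. Face_Shield. Gloves. Goggles. Mask.",
+    # i.e. the candidate category names joined by ". " and terminated with a period; this is the
+    # prompt string handed to the processor in `augment_and_transform_batch` above.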
+ label2id = {v: k for k, v in id2label.items()} + + # ------------------------------------------------------------------------------------------------ + # Load pretrained config, model and image processor + # ------------------------------------------------------------------------------------------------ + + common_pretrained_args = { + "cache_dir": args.cache_dir, + "token": args.hub_token, + "trust_remote_code": args.trust_remote_code, + } + config = AutoConfig.from_pretrained( + args.model_name_or_path, label2id=label2id, id2label=id2label, **common_pretrained_args + ) + model = AutoModelForZeroShotObjectDetection.from_pretrained( + args.model_name_or_path, + config=config, + ignore_mismatched_sizes=args.ignore_mismatched_sizes, + **common_pretrained_args, + ) + processor = AutoProcessor.from_pretrained( + args.model_name_or_path, + ) + + # ------------------------------------------------------------------------------------------------ + # Define image augmentations and dataset transforms + # ------------------------------------------------------------------------------------------------ + max_size = args.image_square_size + train_augment_and_transform = A.Compose( + [ + A.Compose( + [ + A.SmallestMaxSize(max_size=max_size, p=1.0), + A.RandomSizedBBoxSafeCrop(height=max_size, width=max_size, p=1.0), + ], + p=0.2, + ), + A.OneOf( + [ + A.Blur(blur_limit=7, p=0.5), + A.MotionBlur(blur_limit=7, p=0.5), + A.Defocus(radius=(1, 5), alias_blur=(0.1, 0.25), p=0.1), + ], + p=0.1, + ), + A.Perspective(p=0.1), + A.HorizontalFlip(p=0.5), + A.RandomBrightnessContrast(p=0.5), + A.HueSaturationValue(p=0.1), + ], + bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25), + ) + validation_transform = A.Compose( + [A.NoOp()], + bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True), + ) + + # Make transform functions for batch and apply for dataset splits + train_transform_batch = partial( + augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, prompt=prompt + ) + validation_transform_batch = partial( + augment_and_transform_batch, transform=validation_transform, processor=processor, prompt=prompt + ) + + with accelerator.main_process_first(): + train_dataset = dataset["train"].with_transform(train_transform_batch) + valid_dataset = dataset["validation"].with_transform(validation_transform_batch) + test_dataset = dataset["test"].with_transform(validation_transform_batch) + + dataloader_common_args = { + "num_workers": args.dataloader_num_workers, + "collate_fn": collate_fn, + } + train_dataloader = DataLoader( + train_dataset, shuffle=True, batch_size=args.per_device_train_batch_size, **dataloader_common_args + ) + valid_dataloader = DataLoader( + valid_dataset, shuffle=False, batch_size=args.per_device_eval_batch_size, **dataloader_common_args + ) + test_dataloader = DataLoader( + test_dataset, shuffle=False, batch_size=args.per_device_eval_batch_size, **dataloader_common_args + ) + + # ------------------------------------------------------------------------------------------------ + # Define optimizer, scheduler and prepare everything with the accelerator + # ------------------------------------------------------------------------------------------------ + + # Optimizer + optimizer = torch.optim.AdamW( + list(model.parameters()), + lr=args.learning_rate, + betas=[args.adam_beta1, args.adam_beta2], + eps=args.adam_epsilon, + ) + + # Figure out how many steps we should save the Accelerator states + 
checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, valid_dataloader, test_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, valid_dataloader, test_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("object_detection_no_trainer", experiment_config) + + # ------------------------------------------------------------------------------------------------ + # Run training with evaluation on each epoch + # ------------------------------------------------------------------------------------------------ + + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + checkpoint_path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + checkpoint_path = path + path = os.path.basename(checkpoint_path) + + accelerator.print(f"Resumed from checkpoint: {checkpoint_path}") + accelerator.load_state(checkpoint_path) + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps + resume_step -= starting_epoch * len(train_dataloader) + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + if args.with_tracking: + total_loss = 0 + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + + for step, batch in enumerate(active_dataloader): + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + ) + if accelerator.is_main_process: + processor.save_pretrained(args.output_dir) + api.upload_folder( + commit_message=f"Training in progress epoch {epoch}", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, + ) + + if completed_steps >= args.max_train_steps: + break + + logger.info("***** Running evaluation *****") + metrics = evaluation_loop(model, processor, accelerator, valid_dataloader, id2label) + + logger.info(f"epoch {epoch}: {metrics}") + + 
if args.with_tracking: + accelerator.log( + { + "train_loss": total_loss.item() / len(train_dataloader), + **metrics, + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + processor.save_pretrained(args.output_dir) + api.upload_folder( + commit_message=f"Training in progress epoch {epoch}", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + # ------------------------------------------------------------------------------------------------ + # Run evaluation on test dataset and save the model + # ------------------------------------------------------------------------------------------------ + + logger.info("***** Running evaluation on test dataset *****") + metrics = evaluation_loop(model, processor, accelerator, test_dataloader, id2label) + metrics = {f"test_{k}": v for k, v in metrics.items()} + + logger.info(f"Test metrics: {metrics}") + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump(metrics, f, indent=2) + + processor.save_pretrained(args.output_dir) + + if args.push_to_hub: + api.upload_folder( + commit_message="End of training", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, + ignore_patterns=["epoch_*"], + ) + + +if __name__ == "__main__": + main() From 9ec8dd9ac94f54111a65195be8178d19a118a125 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 6 Aug 2024 13:46:52 +0000 Subject: [PATCH 44/61] erase --- ...t.tfevents.1722922514.a9776447cf29.50015.1 | Bin 1642 -> 0 bytes .../1722922514.4384637/hparams.yml | 29 ------------------ ...t.tfevents.1722922680.a9776447cf29.52465.1 | Bin 1642 -> 0 bytes .../1722922680.2782347/hparams.yml | 29 ------------------ ...t.tfevents.1722924013.a9776447cf29.54915.1 | Bin 1642 -> 0 bytes .../1722924013.7994215/hparams.yml | 29 ------------------ ...t.tfevents.1722924136.a9776447cf29.57364.1 | Bin 1642 -> 0 bytes .../1722924136.0336704/hparams.yml | 29 ------------------ ...t.tfevents.1722924201.a9776447cf29.59813.1 | Bin 1642 -> 0 bytes .../1722924201.254446/hparams.yml | 29 ------------------ ...t.tfevents.1722924545.a9776447cf29.62264.1 | Bin 1642 -> 0 bytes .../1722924545.4942276/hparams.yml | 29 ------------------ ...t.tfevents.1722924574.a9776447cf29.63604.1 | Bin 1642 -> 0 bytes .../1722924574.1846154/hparams.yml | 29 ------------------ ...t.tfevents.1722925039.a9776447cf29.67335.1 | Bin 1642 -> 0 bytes .../1722925039.8834014/hparams.yml | 29 ------------------ ...t.tfevents.1722925229.a9776447cf29.71066.1 | Bin 1642 -> 0 bytes .../1722925229.4961867/hparams.yml | 29 ------------------ ...t.tfevents.1722922514.a9776447cf29.50015.0 | 
Bin 88 -> 0 bytes ...t.tfevents.1722922680.a9776447cf29.52465.0 | Bin 88 -> 0 bytes ...t.tfevents.1722924013.a9776447cf29.54915.0 | Bin 88 -> 0 bytes ...t.tfevents.1722924136.a9776447cf29.57364.0 | Bin 88 -> 0 bytes ...t.tfevents.1722924201.a9776447cf29.59813.0 | Bin 88 -> 0 bytes ...t.tfevents.1722924545.a9776447cf29.62264.0 | Bin 88 -> 0 bytes ...t.tfevents.1722924574.a9776447cf29.63604.0 | Bin 88 -> 0 bytes ...t.tfevents.1722925039.a9776447cf29.67335.0 | Bin 88 -> 0 bytes ...t.tfevents.1722925229.a9776447cf29.71066.0 | Bin 88 -> 0 bytes 27 files changed, 261 deletions(-) delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922514.433052/events.out.tfevents.1722922514.a9776447cf29.50015.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922514.4384637/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2719772/events.out.tfevents.1722922680.a9776447cf29.52465.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722922680.2782347/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7937937/events.out.tfevents.1722924013.a9776447cf29.54915.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924013.7994215/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.027696/events.out.tfevents.1722924136.a9776447cf29.57364.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924136.0336704/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924201.2479532/events.out.tfevents.1722924201.a9776447cf29.59813.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924201.254446/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924545.4884186/events.out.tfevents.1722924545.a9776447cf29.62264.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924545.4942276/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.178983/events.out.tfevents.1722924574.a9776447cf29.63604.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722924574.1846154/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8776126/events.out.tfevents.1722925039.a9776447cf29.67335.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925039.8834014/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4844875/events.out.tfevents.1722925229.a9776447cf29.71066.1 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/1722925229.4961867/hparams.yml delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922514.a9776447cf29.50015.0 delete mode 100644 
examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722922680.a9776447cf29.52465.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924013.a9776447cf29.54915.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924136.a9776447cf29.57364.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924201.a9776447cf29.59813.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924545.a9776447cf29.62264.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722924574.a9776447cf29.63604.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925039.a9776447cf29.67335.0
 delete mode 100644 examples/pytorch/zero-shot/grounding-dino-tiny-finetuned/object_detection_no_trainer/events.out.tfevents.1722925229.a9776447cf29.71066.0

[GIT binary patch hunks omitted: they delete the TensorBoard event files listed above together with the per-run hparams.yml snapshots. Every deleted hparams.yml recorded the configuration below; the final run (1722925229) differed only in per_device_train_batch_size: 1, per_device_eval_batch_size: 1 and max_train_steps: 21300.]

-adam_beta1: 0.9
-adam_beta2: 0.999
-adam_epsilon: 1.0e-08
-cache_dir: null
-checkpointing_steps: epoch
-dataloader_num_workers: 4
-dataset_name: cppe-5
-gradient_accumulation_steps: 1
-hub_model_id: null
-hub_token: null
-ignore_mismatched_sizes: true
-image_square_size: 600
-learning_rate: 5.0e-05
-lr_scheduler_type: linear
-max_train_steps: 2700
-model_name_or_path: IDEA-Research/grounding-dino-tiny
-num_train_epochs: 100
-num_warmup_steps: 0
-output_dir: grounding-dino-tiny-finetuned
-per_device_eval_batch_size: 8
-per_device_train_batch_size: 8
-push_to_hub: false
-report_to: all
-resume_from_checkpoint: null
-seed: null
-train_val_split: 0.15
-trust_remote_code: false
-use_auth_token: false
-with_tracking: true

From 3c8dd45e20800b210ebd415ed112a3676f4585a9 Mon Sep 17 00:00:00 2001
From: sangbumchoi
Date: Tue, 6 Aug 2024 13:47:07 +0000
Subject: [PATCH 45/61] fix typo

---
 examples/pytorch/object-detection/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/object-detection/README.md b/examples/pytorch/object-detection/README.md
index ab474f76075305..50d8bcc20fe1a2 100644
--- a/examples/pytorch/object-detection/README.md
+++ b/examples/pytorch/object-detection/README.md
@@ -69,7 +69,7 @@ python run_object_detection.py \
 `--eval_do_concat_batches false` is required for correct evaluation of detection models;
 `--ignore_mismatched_sizes true` is required to load detection model for finetuning with different number of classes.
 
-The resulting model can be seen here: https://huggingface.co/qubvel-hf/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. To improve model quality you could try:
+The resulting model can be seen here: https://huggingface.co/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned.
To improve model quality you could try: - changing image size parameters (`--shortest_edge`/`--longest_edge`) - changing training parameters, such as learning rate, batch size, warmup, optimizer and many more (see [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)) - adding more image augmentations (we created a helpful [HF Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo) to choose some) From baebefd6da2f614948f1aa572302091d3760947d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 7 Aug 2024 00:17:11 +0000 Subject: [PATCH 46/61] tmp --- examples/pytorch/zero-shot/README.md | 29 ++++--- ...n_zero_shot_object_detection_no_trainer.py | 86 +++++++++++++++---- 2 files changed, 83 insertions(+), 32 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 34d18393c7c4d2..6bb2cca13e7496 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -27,11 +27,11 @@ Content: ## PyTorch version, Trainer -Based on the script [`run_object_detection.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_object_detection.py). +Based on the script [`run_zero_shot_object_detection.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/zero-shot/run_zero_shot_object_detection.py). The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transformers/main_classes/trainer) to automatically take care of the training for you, running on distributed environments right away. -Here we show how to fine-tune a [DETR](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset: +Here we show how to fine-tune a [GroundingDino](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset: ```bash python run_zero_shot_object_detection.py \ @@ -39,7 +39,7 @@ python run_zero_shot_object_detection.py \ --dataset_name cppe-5 \ --do_train true \ --do_eval true \ - --output_dir grounding-dino-tiny-finetuned-cppe-5-10k-steps \ + --output_dir grounding-dino-tiny-finetuned-cppe5-10k-steps \ --num_train_epochs 100 \ --image_square_size 600 \ --fp16 true \ @@ -69,7 +69,7 @@ python run_zero_shot_object_detection.py \ `--eval_do_concat_batches false` is required for correct evaluation of detection models; `--ignore_mismatched_sizes true` is required to load detection model for finetuning with different number of classes. -The resulting model can be seen here: https://huggingface.co/qubvel-hf/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. To improve model quality you could try: +The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe5-10k-steps. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. 
To improve model quality you could try: - changing image size parameters (`--shortest_edge`/`--longest_edge`) - changing training parameters, such as learning rate, batch size, warmup, optimizer and many more (see [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)) - adding more image augmentations (we created a helpful [HF Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo) to choose some) @@ -82,7 +82,7 @@ For dataset, make sure it provides labels in the same format as [CPPE-5](https:/ ## PyTorch version, no Trainer -Based on the script [`run_object_detection_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_object_detection.py). +Based on the script [`run_zero_shot_object_detection_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_zero_shot_object_detection.py). The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision. @@ -104,8 +104,8 @@ that will check everything is ready for training. Finally, you can launch traini accelerate launch run_zero_shot_object_detection_no_trainer.py \ --model_name_or_path "IDEA-Research/grounding-dino-tiny" \ --dataset_name cppe-5 \ - --output_dir "grounding-dino-tiny-finetuned" \ - --num_train_epochs 100 \ + --output_dir "grounding-dino-tiny-finetuned-cppe5-10k-steps-no-trainer" \ + --num_train_epochs 10 \ --image_square_size 600 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ @@ -118,7 +118,7 @@ accelerate launch run_zero_shot_object_detection_no_trainer.py \ and boom, you're training, possibly on multiple GPUs, logging everything to all trackers found in your environment (like Weights and Biases, Tensorboard) and regularly pushing your model to the hub (with the repo name being equal to `args.output_dir` at your HF username) 🤗 -With the default settings, the script fine-tunes a [DETR](https://huggingface.co/facebook/detr-resnet-50) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5-no-trainer. +With the default settings, the script fine-tunes a [GroundingDino](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe5-no-trainer. 
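Both scripts construct the Grounding DINO text prompt directly from the dataset's category names. As a quick sanity check, here is a minimal sketch (assuming the `cppe-5` dataset and the `datasets` library are available) of what that prompt looks like:

```python
from datasets import load_dataset

# Read the CPPE-5 category names the same way the example scripts do
dataset = load_dataset("cppe-5")
categories = dataset["train"].features["objects"].feature["category"].names
id2label = dict(enumerate(categories))

# The prompt is simply the label names joined with ". " plus a trailing "."
prompt = ". ".join(id2label.values()) + "."
print(id2label)  # e.g. {0: 'Coverall', 1: 'Face_Shield', 2: 'Gloves', 3: 'Goggles', 4: 'Mask'}
print(prompt)    # e.g. "Coverall. Face_Shield. Gloves. Goggles. Mask."
```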
## Reload and perform inference @@ -130,20 +130,21 @@ import requests import torch from PIL import Image -from transformers import AutoImageProcessor, AutoModelForObjectDetection +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection # Name of repo on the hub or path to a local folder -model_name = "qubvel-hf/detr-resnet-50-finetuned-10k-cppe5" +model_name = "danelcsb/grounding-dino-tiny-finetuned-10k-cppe5" -image_processor = AutoImageProcessor.from_pretrained(model_name) -model = AutoModelForObjectDetection.from_pretrained(model_name) +image_processor = AutoProcessor.from_pretrained(model_name) +model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name) # Load image for inference url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2" image = Image.open(requests.get(url, stream=True).raw) +text = "Coverall. Face_Shield. Gloves. Goggles. Mask" # Prepare image for the model -inputs = image_processor(images=image, return_tensors="pt") +inputs = image_processor(images=image, text=text, return_tensors="pt") with torch.no_grad(): outputs = model(**inputs) @@ -152,7 +153,7 @@ with torch.no_grad(): # this include conversion to Pascal VOC format and filtering non confident boxes width, height = image.size target_sizes = torch.tensor([height, width]).unsqueeze(0) # add batch dim -results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] +results = processor.post_process_grounded_object_detection(outputs, inputs.input_ids, box_threshold=0.15, text_threshold=0.1, target_sizes=target_sizes)[0] for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): box = [round(i, 2) for i in box.tolist()] diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py index d497204e7564b7..99a6ac04a924e4 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py @@ -27,7 +27,7 @@ import datasets import numpy as np import torch -from accelerate import Accelerator +from accelerate import Accelerator, DistributedDataParallelKwargs from accelerate.logging import get_logger from accelerate.utils import set_seed from datasets import load_dataset @@ -118,6 +118,50 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int] return boxes +def convert_zero_shot_to_coco_format(predictions, label2id): + """ + Convert zershot format output to typical object detection format in order to calculate mAP. + + Args: + predictions (Dict): Output of zero-shot object detection + e.g. + { + 'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), + 'labels': ['a cat', 'a cat', 'a remote control'], + 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748], + [ 12.2690, 51.9104, 316.8564, 472.4341], + [ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0') + } + label2id (Dict): Dictionary of label to id mapping + + Returns: + Dict: Output of zero-shot object detection + e.g. 
+ { + 'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), + 'labels': tensor([1, 1, 2], device='cuda:0'), + 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748], + [ 12.2690, 51.9104, 316.8564, 472.4341], + [ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0') + } + """ + # convert center to corners format + torch_label = [] + for prediction in predictions: + scores = prediction['scores'] + device = scores.device + labels = prediction['labels'] + for label in labels: + if label in label2id: + torch_label.append(label) + else: + # Give background class + torch_label.append(0) + prediction['labels'] = torch.Tensor(torch_label).to(device) + + return predictions + + # Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch def augment_and_transform_batch( examples: Mapping[str, Any], @@ -187,6 +231,7 @@ def evaluation_loop( accelerator: Accelerator, dataloader: DataLoader, id2label: Mapping[int, str], + label2id: Mapping[str, int], ) -> dict: model.eval() metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True) @@ -201,8 +246,10 @@ def evaluation_loop( # processor convert boxes from YOLO format to Pascal VOC format # ([x_min, y_min, x_max, y_max] in absolute coordinates) image_size = torch.stack([example["orig_size"] for example in batch["labels"]], dim=0) - predictions = processor.post_process_grounded_object_detection(outputs, threshold=0.0, target_sizes=image_size) + input_ids = torch.stack([input_ids for input_ids in batch["input_ids"]], dim=0) + predictions = processor.post_process_grounded_object_detection(outputs, input_ids, box_threshold=0.0, text_threshold=0.0, target_sizes=image_size) predictions = nested_to_cpu(predictions) + predictions = convert_zero_shot_to_coco_format(predictions, label2id) # 2. 
Collect ground truth boxes in the same format for metric computation # Do the same, convert YOLO boxes to Pascal VOC format @@ -215,19 +262,20 @@ def evaluation_loop( metric.update(predictions, target) - metrics = metric.compute() + # metrics = metric.compute() - # Replace list of per class metrics with separate metric for each class - classes = metrics.pop("classes") - map_per_class = metrics.pop("map_per_class") - mar_100_per_class = metrics.pop("mar_100_per_class") - for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class): - class_name = id2label[class_id.item()] - metrics[f"map_{class_name}"] = class_map - metrics[f"mar_100_{class_name}"] = class_mar + # # Replace list of per class metrics with separate metric for each class + # classes = metrics.pop("classes") + # map_per_class = metrics.pop("map_per_class") + # mar_100_per_class = metrics.pop("mar_100_per_class") + # for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class): + # class_name = id2label[class_id.item()] + # metrics[f"map_{class_name}"] = class_map + # metrics[f"mar_100_{class_name}"] = class_mar - # Convert metrics to float - metrics = {k: round(v.item(), 4) for k, v in metrics.items()} + # # Convert metrics to float + # metrics = {k: round(v.item(), 4) for k, v in metrics.items()} + metrics = {} return metrics @@ -412,6 +460,7 @@ def main(): if args.with_tracking: accelerator_log_kwargs["log_with"] = args.report_to accelerator_log_kwargs["project_dir"] = args.output_dir + accelerator_log_kwargs["kwargs_handlers"] = [DistributedDataParallelKwargs(find_unused_parameters=True)] accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) @@ -463,7 +512,6 @@ def main(): # Get dataset categories and prepare mappings for label_name <-> label_id categories = dataset["train"].features["objects"].feature["category"].names id2label = dict(enumerate(categories)) - prompt = ". ".join(id2label.values()) + "." label2id = {v: k for k, v in id2label.items()} # ------------------------------------------------------------------------------------------------ @@ -522,11 +570,13 @@ def main(): ) # Make transform functions for batch and apply for dataset splits + prompt = ". ".join(id2label.values()) + "." 
+ train_transform_batch = partial( - augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, prompt=prompt + augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, prompt=prompt, ) validation_transform_batch = partial( - augment_and_transform_batch, transform=validation_transform, processor=processor, prompt=prompt + augment_and_transform_batch, transform=validation_transform, processor=processor, prompt=prompt, ) with accelerator.main_process_first(): @@ -708,7 +758,7 @@ def main(): break logger.info("***** Running evaluation *****") - metrics = evaluation_loop(model, processor, accelerator, valid_dataloader, id2label) + metrics = evaluation_loop(model, processor, accelerator, valid_dataloader, id2label, label2id) logger.info(f"epoch {epoch}: {metrics}") @@ -750,7 +800,7 @@ def main(): # ------------------------------------------------------------------------------------------------ logger.info("***** Running evaluation on test dataset *****") - metrics = evaluation_loop(model, processor, accelerator, test_dataloader, id2label) + metrics = evaluation_loop(model, processor, accelerator, test_dataloader, id2label, label2id) metrics = {f"test_{k}": v for k, v in metrics.items()} logger.info(f"Test metrics: {metrics}") From bde2d120eb71a86b4b49994d8a02c1d79e4aaef4 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 7 Aug 2024 02:18:01 +0000 Subject: [PATCH 47/61] tmp --- examples/pytorch/zero-shot/README.md | 8 +-- ...n_zero_shot_object_detection_no_trainer.py | 59 ++++++++++++++++--- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 6bb2cca13e7496..f8aec74257416a 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -39,7 +39,7 @@ python run_zero_shot_object_detection.py \ --dataset_name cppe-5 \ --do_train true \ --do_eval true \ - --output_dir grounding-dino-tiny-finetuned-cppe5-10k-steps \ + --output_dir grounding-dino-tiny-finetuned-cppe-5-10k-steps \ --num_train_epochs 100 \ --image_square_size 600 \ --fp16 true \ @@ -69,7 +69,7 @@ python run_zero_shot_object_detection.py \ `--eval_do_concat_batches false` is required for correct evaluation of detection models; `--ignore_mismatched_sizes true` is required to load detection model for finetuning with different number of classes. -The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe5-10k-steps. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. To improve model quality you could try: +The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe-5-10k-steps. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. 
To improve model quality you could try: - changing image size parameters (`--shortest_edge`/`--longest_edge`) - changing training parameters, such as learning rate, batch size, warmup, optimizer and many more (see [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)) - adding more image augmentations (we created a helpful [HF Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo) to choose some) @@ -104,7 +104,7 @@ that will check everything is ready for training. Finally, you can launch traini accelerate launch run_zero_shot_object_detection_no_trainer.py \ --model_name_or_path "IDEA-Research/grounding-dino-tiny" \ --dataset_name cppe-5 \ - --output_dir "grounding-dino-tiny-finetuned-cppe5-10k-steps-no-trainer" \ + --output_dir "grounding-dino-tiny-finetuned-cppe-5-10k-steps-no-trainer" \ --num_train_epochs 10 \ --image_square_size 600 \ --per_device_train_batch_size 1 \ @@ -118,7 +118,7 @@ accelerate launch run_zero_shot_object_detection_no_trainer.py \ and boom, you're training, possibly on multiple GPUs, logging everything to all trackers found in your environment (like Weights and Biases, Tensorboard) and regularly pushing your model to the hub (with the repo name being equal to `args.output_dir` at your HF username) 🤗 -With the default settings, the script fine-tunes a [GroundingDino](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe5-no-trainer. +With the default settings, the script fine-tunes a [GroundingDino](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe-5-no-trainer. ## Reload and perform inference diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py index 99a6ac04a924e4..1f7d5445927954 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py @@ -19,9 +19,10 @@ import logging import math import os +import random from functools import partial from pathlib import Path -from typing import Any, List, Mapping, Tuple, Union +from typing import Dict, Any, List, Mapping, Tuple, Union import albumentations as A import datasets @@ -167,19 +168,59 @@ def augment_and_transform_batch( examples: Mapping[str, Any], transform: A.Compose, processor: AutoProcessor, - prompt: str, + id2label: Dict[int, str], + label2id: Dict[str, int], + random_text_prompt: bool = False, return_pixel_mask: bool = False, ) -> BatchFeature: - """Apply augmentations and format annotations in COCO format for object detection task""" + """ + Apply augmentations and format annotations in COCO format for object detection task. + Generates the text prompt used. If `random_text_prompt` is False + then the prompt will follow the same ordering in `id2label` if set to + True a new ordering will be created and the prompt will be build accordingly + and labels will be updated as well. 
+ + Example: + `id2label` -> {'0': 'fish', '1': 'jellyfish', '2': 'penguins', '3': + 'sharks', '4': 'puffins', '5': 'stingrays', '6': 'starfish'} + + If `random_text_prompt` -> False + `text` -> "fish. jellyfish. penguins. sharks. puffins. stingrays. starfish." + + If `random_text_prompt` -> True + `id2label` gets shuffled e.g. {0: 'fish', 1: 'penguins', 2: 'stingrays', 3: + 'jellyfish', 4: 'sharks', 5: 'starfish', 6: 'puffins'} + `text` -> "fish. penguins. stingrays. jellyfish. sharks. starfish. puffins." + """ images = [] annotations = [] text = [] + + to_label_list = lambda x: list(x.values()) + concat_func = lambda x: ". ".join(to_label_list(x)) + "." + for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]): image = np.array(image.convert("RGB")) + if random_text_prompt: + # Original ordering label list + label_list = to_label_list(id2label) + # Shuffle label list + random.shuffle(label_list) + # Create shuffled id2label + shuffled_id2label = {idx: label for idx, label in enumerate(label_list)} + + # Mapping of original to shuffled id to update annotations + old2new = {label2id[label]: new_id for new_id, label in shuffled_id2label.items()} + prompt = concat_func(shuffled_id2label) + category = [old2new[category] for category in objects["category"]] + else: + prompt = concat_func(id2label) + category = objects["category"] + # apply augmentations - output = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + output = transform(image=image, bboxes=objects["bbox"], category=category) images.append(output["image"]) # format annotations in COCO format @@ -524,7 +565,7 @@ def main(): "trust_remote_code": args.trust_remote_code, } config = AutoConfig.from_pretrained( - args.model_name_or_path, label2id=label2id, id2label=id2label, **common_pretrained_args + args.model_name_or_path, auxiliary_loss=True, label2id=label2id, id2label=id2label, **common_pretrained_args ) model = AutoModelForZeroShotObjectDetection.from_pretrained( args.model_name_or_path, @@ -573,15 +614,15 @@ def main(): prompt = ". ".join(id2label.values()) + "." 
train_transform_batch = partial( - augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, prompt=prompt, + augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, id2label=id2label, label2id=label2id, random_text_prompt=False ) validation_transform_batch = partial( - augment_and_transform_batch, transform=validation_transform, processor=processor, prompt=prompt, + augment_and_transform_batch, transform=validation_transform, processor=processor, id2label=id2label, label2id=label2id, random_text_prompt=True ) with accelerator.main_process_first(): - train_dataset = dataset["train"].with_transform(train_transform_batch) - valid_dataset = dataset["validation"].with_transform(validation_transform_batch) + train_dataset = dataset["test"].with_transform(train_transform_batch) + valid_dataset = dataset["test"].with_transform(validation_transform_batch) test_dataset = dataset["test"].with_transform(validation_transform_batch) dataloader_common_args = { From cc399c1830b00dededc04b726644fc5627d2919c Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 7 Aug 2024 02:57:37 +0000 Subject: [PATCH 48/61] make style --- examples/pytorch/zero-shot/README.md | 4 +- .../run_zero_shot_object_detection.py | 71 ++++++++++++--- ...n_zero_shot_object_detection_no_trainer.py | 89 +++++++++++-------- 3 files changed, 118 insertions(+), 46 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index f8aec74257416a..500b8d70633cd5 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -113,7 +113,9 @@ accelerate launch run_zero_shot_object_detection_no_trainer.py \ --learning_rate 5e-5 \ --ignore_mismatched_sizes \ --with_tracking \ - --push_to_hub + --push_to_hub \ + --freeze_backbone \ + --freeze_text_backbone ``` and boom, you're training, possibly on multiple GPUs, logging everything to all trackers found in your environment (like Weights and Biases, Tensorboard) and regularly pushing your model to the hub (with the repo name being equal to `args.output_dir` at your HF username) 🤗 diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index 5d98267625e5b1..97ec6c012ca655 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -17,10 +17,11 @@ import logging import os +import random import sys from dataclasses import dataclass, field from functools import partial -from typing import Any, List, Mapping, Optional, Tuple, Union +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union import albumentations as A import numpy as np @@ -31,8 +32,8 @@ import transformers from transformers import ( AutoConfig, - AutoProcessor, AutoModelForZeroShotObjectDetection, + AutoProcessor, HfArgumentParser, Trainer, TrainingArguments, @@ -116,21 +117,68 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int] return boxes +def to_label_list(id2label): + return list(id2label.values()) + + +def concat_func(id2label): + return ". ".join(to_label_list(id2label)) + "." 
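# For illustration only (assumed toy labels, not part of the training script): with
# id2label = {0: "fish", 1: "jellyfish"}, concat_func(id2label) returns
# "fish. jellyfish.", which is the text prompt passed to the processor. When
# random_text_prompt shuffles the label order, the annotation ids are remapped to
# the new prompt positions, e.g.
#   shuffled_id2label = {0: "jellyfish", 1: "fish"}
#   old2new = {1: 0, 0: 1}   # original id -> position in the shuffled prompt
# so boxes originally labelled "fish" (id 0) are relabelled with id 1.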
+ + def augment_and_transform_batch( examples: Mapping[str, Any], transform: A.Compose, - image_processor: AutoImageProcessor, + processor: AutoProcessor, + id2label: Dict[int, str], + label2id: Dict[str, int], + random_text_prompt: bool = False, return_pixel_mask: bool = False, ) -> BatchFeature: - """Apply augmentations and format annotations in COCO format for object detection task""" + """ + Apply augmentations and format annotations in COCO format for object detection task. + Generates the text prompt used. If `random_text_prompt` is False + then the prompt will follow the same ordering in `id2label` if set to + True a new ordering will be created and the prompt will be build accordingly + and labels will be updated as well. + + Example: + `id2label` -> {'0': 'fish', '1': 'jellyfish', '2': 'penguins', '3': + 'sharks', '4': 'puffins', '5': 'stingrays', '6': 'starfish'} + + If `random_text_prompt` -> False + `text` -> "fish. jellyfish. penguins. sharks. puffins. stingrays. starfish." + + If `random_text_prompt` -> True + `id2label` gets shuffled e.g. {0: 'fish', 1: 'penguins', 2: 'stingrays', 3: + 'jellyfish', 4: 'sharks', 5: 'starfish', 6: 'puffins'} + `text` -> "fish. penguins. stingrays. jellyfish. sharks. starfish. puffins." + """ images = [] annotations = [] + text = [] + for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]): image = np.array(image.convert("RGB")) + if random_text_prompt: + # Original ordering label list + label_list = to_label_list(id2label) + # Shuffle label list + random.shuffle(label_list) + # Create shuffled id2label + shuffled_id2label = dict(enumerate(label_list)) + + # Mapping of original to shuffled id to update annotations + old2new = {label2id[label]: new_id for new_id, label in shuffled_id2label.items()} + prompt = concat_func(shuffled_id2label) + category = [old2new[category] for category in objects["category"]] + else: + prompt = concat_func(id2label) + category = objects["category"] + # apply augmentations - output = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + output = transform(image=image, bboxes=objects["bbox"], category=category) images.append(output["image"]) # format annotations in COCO format @@ -138,9 +186,10 @@ def augment_and_transform_batch( image_id, output["category"], objects["area"], output["bboxes"] ) annotations.append(formatted_annotations) + text.append(prompt) # Apply the image processor transformations: resizing, rescaling, normalization - result = image_processor(images=images, annotations=annotations, return_tensors="pt") + result = processor(images=images, text=text, annotations=annotations, return_tensors="pt") if not return_pixel_mask: result.pop("pixel_mask", None) @@ -151,16 +200,20 @@ def augment_and_transform_batch( def collate_fn(batch: List[BatchFeature]) -> Mapping[str, Union[torch.Tensor, List[Any]]]: data = {} data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch]) + data["input_ids"] = torch.stack([x["input_ids"] for x in batch]) + data["token_type_ids"] = torch.stack([x["token_type_ids"] for x in batch]) data["labels"] = [x["labels"] for x in batch] if "pixel_mask" in batch[0]: data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch]) + if "attention_mask" in batch[0]: + data["attention_mask"] = torch.stack([x["attention_mask"] for x in batch]) return data @torch.no_grad() def compute_metrics( evaluation_results: EvalPrediction, - image_processor: AutoImageProcessor, + image_processor: AutoProcessor, threshold: 
float = 0.0, id2label: Optional[Mapping[int, str]] = None, ) -> Mapping[str, float]: @@ -474,9 +527,7 @@ def main(): # Model training and evaluation with Trainer API # ------------------------------------------------------------------------------------------------ - eval_compute_metrics_fn = partial( - compute_metrics, image_processor=processor, id2label=id2label, threshold=0.0 - ) + eval_compute_metrics_fn = partial(compute_metrics, image_processor=processor, id2label=id2label, threshold=0.0) trainer = Trainer( model=model, diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py index 1f7d5445927954..9318f017702b87 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py @@ -22,7 +22,7 @@ import random from functools import partial from pathlib import Path -from typing import Dict, Any, List, Mapping, Tuple, Union +from typing import Any, Dict, List, Mapping, Tuple, Union import albumentations as A import datasets @@ -40,8 +40,8 @@ import transformers from transformers import ( AutoConfig, - AutoProcessor, AutoModelForZeroShotObjectDetection, + AutoProcessor, SchedulerType, get_scheduler, ) @@ -125,45 +125,39 @@ def convert_zero_shot_to_coco_format(predictions, label2id): Args: predictions (Dict): Output of zero-shot object detection - e.g. - { - 'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), - 'labels': ['a cat', 'a cat', 'a remote control'], - 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748], - [ 12.2690, 51.9104, 316.8564, 472.4341], - [ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0') - } + e.g. {'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), 'labels': ['a cat', 'a cat', 'a remote control'], 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748],[ 12.2690, 51.9104, 316.8564, 472.4341],[ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0')} label2id (Dict): Dictionary of label to id mapping Returns: Dict: Output of zero-shot object detection - e.g. - { - 'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), - 'labels': tensor([1, 1, 2], device='cuda:0'), - 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748], - [ 12.2690, 51.9104, 316.8564, 472.4341], - [ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0') - } + e.g. {'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), 'labels': [1, 1, 2], 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748],[ 12.2690, 51.9104, 316.8564, 472.4341],[ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0')} + """ # convert center to corners format torch_label = [] for prediction in predictions: - scores = prediction['scores'] + scores = prediction["scores"] device = scores.device - labels = prediction['labels'] + labels = prediction["labels"] for label in labels: if label in label2id: torch_label.append(label) else: # Give background class torch_label.append(0) - prediction['labels'] = torch.Tensor(torch_label).to(device) + prediction["labels"] = torch.Tensor(torch_label).to(device) return predictions -# Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch +def to_label_list(id2label): + return list(id2label.values()) + + +def concat_func(id2label): + return ". ".join(to_label_list(id2label)) + "." 
+ + def augment_and_transform_batch( examples: Mapping[str, Any], transform: A.Compose, @@ -197,9 +191,6 @@ def augment_and_transform_batch( annotations = [] text = [] - to_label_list = lambda x: list(x.values()) - concat_func = lambda x: ". ".join(to_label_list(x)) + "." - for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]): image = np.array(image.convert("RGB")) @@ -209,7 +200,7 @@ def augment_and_transform_batch( # Shuffle label list random.shuffle(label_list) # Create shuffled id2label - shuffled_id2label = {idx: label for idx, label in enumerate(label_list)} + shuffled_id2label = dict(enumerate(label_list)) # Mapping of original to shuffled id to update annotations old2new = {label2id[label]: new_id for new_id, label in shuffled_id2label.items()} @@ -239,7 +230,6 @@ def augment_and_transform_batch( return result -# Copied from examples/pytorch/object-detection/run_object_detection.collate_fn def collate_fn(batch: List[BatchFeature]) -> Mapping[str, Union[torch.Tensor, List[Any]]]: data = {} data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch]) @@ -287,8 +277,10 @@ def evaluation_loop( # processor convert boxes from YOLO format to Pascal VOC format # ([x_min, y_min, x_max, y_max] in absolute coordinates) image_size = torch.stack([example["orig_size"] for example in batch["labels"]], dim=0) - input_ids = torch.stack([input_ids for input_ids in batch["input_ids"]], dim=0) - predictions = processor.post_process_grounded_object_detection(outputs, input_ids, box_threshold=0.0, text_threshold=0.0, target_sizes=image_size) + input_ids = torch.stack(batch["input_ids"], dim=0) + predictions = processor.post_process_grounded_object_detection( + outputs, input_ids, box_threshold=0.0, text_threshold=0.0, target_sizes=image_size + ) predictions = nested_to_cpu(predictions) predictions = convert_zero_shot_to_coco_format(predictions, label2id) @@ -471,6 +463,18 @@ def parse_args(): "Only applicable when `--with_tracking` is passed." ), ) + parser.add_argument( + "--freeze_backbone", + required=False, + action="store_true", + help="Whether to freeze the image encoder while training.", + ) + parser.add_argument( + "--freeze_text_backbone", + required=False, + action="store_true", + help="Whether to freeze the text encoder while training.", + ) args = parser.parse_args() # Sanity checks @@ -577,6 +581,13 @@ def main(): args.model_name_or_path, ) + # Freeze both text_backbone + if args.freeze_backbone: + model.model.freeze_backbone() + if args.freeze_text_backbone: + for name, param in model.model.text_backbone.named_parameters(): + param.requires_grad_(False) + # ------------------------------------------------------------------------------------------------ # Define image augmentations and dataset transforms # ------------------------------------------------------------------------------------------------ @@ -611,18 +622,26 @@ def main(): ) # Make transform functions for batch and apply for dataset splits - prompt = ". ".join(id2label.values()) + "." 
- train_transform_batch = partial( - augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, id2label=id2label, label2id=label2id, random_text_prompt=False + augment_and_transform_batch, + transform=train_augment_and_transform, + processor=processor, + id2label=id2label, + label2id=label2id, + random_text_prompt=False, ) validation_transform_batch = partial( - augment_and_transform_batch, transform=validation_transform, processor=processor, id2label=id2label, label2id=label2id, random_text_prompt=True + augment_and_transform_batch, + transform=validation_transform, + processor=processor, + id2label=id2label, + label2id=label2id, + random_text_prompt=True, ) with accelerator.main_process_first(): - train_dataset = dataset["test"].with_transform(train_transform_batch) - valid_dataset = dataset["test"].with_transform(validation_transform_batch) + train_dataset = dataset["train"].with_transform(train_transform_batch) + valid_dataset = dataset["validation"].with_transform(validation_transform_batch) test_dataset = dataset["test"].with_transform(validation_transform_batch) dataloader_common_args = { From 1b156b5c78c4e40d7056b6987ad59d37715e5f2c Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 7 Aug 2024 05:29:43 +0000 Subject: [PATCH 49/61] tmp --- examples/pytorch/zero-shot/README.md | 2 ++ ...n_zero_shot_object_detection_no_trainer.py | 29 +++++++++---------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 500b8d70633cd5..34099371549c05 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -77,6 +77,8 @@ The resulting model can be seen here: https://huggingface.co/danelcsb/grounding- Note that you can replace the model and dataset by simply setting the `model_name_or_path` and `dataset_name` arguments respectively, with model or dataset from the [hub](https://huggingface.co/). For dataset, make sure it provides labels in the same format as [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset and boxes are provided in [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco). +Note that zero-shot inference output is not the same output format as object-detection output. In order to compute the evaluation metric performance, we have to modify the output little bit. 
+ ![W&B report](https://i.imgur.com/ASNjamQ.png) diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py index 9318f017702b87..3b746aee7d2492 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py @@ -145,7 +145,7 @@ def convert_zero_shot_to_coco_format(predictions, label2id): else: # Give background class torch_label.append(0) - prediction["labels"] = torch.Tensor(torch_label).to(device) + prediction["labels"] = torch.Tensor(torch_label).to(dtype=torch.int32).to(device) return predictions @@ -277,9 +277,9 @@ def evaluation_loop( # processor convert boxes from YOLO format to Pascal VOC format # ([x_min, y_min, x_max, y_max] in absolute coordinates) image_size = torch.stack([example["orig_size"] for example in batch["labels"]], dim=0) - input_ids = torch.stack(batch["input_ids"], dim=0) + input_ids = batch["input_ids"] predictions = processor.post_process_grounded_object_detection( - outputs, input_ids, box_threshold=0.0, text_threshold=0.0, target_sizes=image_size + outputs, input_ids, box_threshold=0.15, text_threshold=0.1, target_sizes=image_size ) predictions = nested_to_cpu(predictions) predictions = convert_zero_shot_to_coco_format(predictions, label2id) @@ -295,20 +295,19 @@ def evaluation_loop( metric.update(predictions, target) - # metrics = metric.compute() + metrics = metric.compute() - # # Replace list of per class metrics with separate metric for each class - # classes = metrics.pop("classes") - # map_per_class = metrics.pop("map_per_class") - # mar_100_per_class = metrics.pop("mar_100_per_class") - # for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class): - # class_name = id2label[class_id.item()] - # metrics[f"map_{class_name}"] = class_map - # metrics[f"mar_100_{class_name}"] = class_mar + # Replace list of per class metrics with separate metric for each class + classes = metrics.pop("classes") + map_per_class = metrics.pop("map_per_class") + mar_100_per_class = metrics.pop("mar_100_per_class") + for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class): + class_name = id2label[class_id.item()] + metrics[f"map_{class_name}"] = class_map + metrics[f"mar_100_{class_name}"] = class_mar - # # Convert metrics to float - # metrics = {k: round(v.item(), 4) for k, v in metrics.items()} - metrics = {} + # Convert metrics to float + metrics = {k: round(v.item(), 4) for k, v in metrics.items()} return metrics From ffce43ca179f382b763477b9de31db175190cdc4 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 8 Aug 2024 02:31:55 +0000 Subject: [PATCH 50/61] currently obejct_detecion.py has OOM error --- examples/pytorch/zero-shot/README.md | 4 +- .../run_zero_shot_object_detection.py | 78 ++++++++++++++++--- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 34099371549c05..40c22be8d44d5e 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -40,14 +40,14 @@ python run_zero_shot_object_detection.py \ --do_train true \ --do_eval true \ --output_dir grounding-dino-tiny-finetuned-cppe-5-10k-steps \ - --num_train_epochs 100 \ + --num_train_epochs 10 \ --image_square_size 600 \ --fp16 true \ --learning_rate 5e-5 \ --weight_decay 1e-4 \ --dataloader_num_workers 4 \ --dataloader_prefetch_factor 2 
\ - --per_device_train_batch_size 8 \ + --per_device_train_batch_size 1 \ --gradient_accumulation_steps 1 \ --remove_unused_columns false \ --eval_do_concat_batches false \ diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index 97ec6c012ca655..eaf512700e3caf 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -117,6 +117,37 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int] return boxes +def convert_zero_shot_to_coco_format(predictions, label2id): + """ + Convert zershot format output to typical object detection format in order to calculate mAP. + + Args: + predictions (Dict): Output of zero-shot object detection + e.g. {'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), 'labels': ['a cat', 'a cat', 'a remote control'], 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748],[ 12.2690, 51.9104, 316.8564, 472.4341],[ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0')} + label2id (Dict): Dictionary of label to id mapping + + Returns: + Dict: Output of zero-shot object detection + e.g. {'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'), 'labels': [1, 1, 2], 'boxes': tensor([[344.6973, 23.1085, 637.1817, 374.2748],[ 12.2690, 51.9104, 316.8564, 472.4341],[ 38.5870, 70.0092, 176.7755, 118.1748]], device='cuda:0')} + + """ + # convert center to corners format + torch_label = [] + for prediction in predictions: + scores = prediction["scores"] + device = scores.device + labels = prediction["labels"] + for label in labels: + if label in label2id: + torch_label.append(label) + else: + # Give background class + torch_label.append(0) + prediction["labels"] = torch.Tensor(torch_label).to(dtype=torch.int32).to(device) + + return predictions + + def to_label_list(id2label): return list(id2label.values()) @@ -213,16 +244,19 @@ def collate_fn(batch: List[BatchFeature]) -> Mapping[str, Union[torch.Tensor, Li @torch.no_grad() def compute_metrics( evaluation_results: EvalPrediction, - image_processor: AutoProcessor, - threshold: float = 0.0, + processor: AutoProcessor, + box_threshold: float = 0.15, + text_threshold: float = 0.1, id2label: Optional[Mapping[int, str]] = None, + label2id: Optional[Mapping[str, int]] = None, ) -> Mapping[str, float]: """ Compute mean average mAP, mAR and their variants for the object detection task. Args: evaluation_results (EvalPrediction): Predictions and targets from evaluation. - threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0. + box_threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.15. + text_threshold (float, optional): Threshold to filter predicted text by confidence. Defaults to 0.1. id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None. 
Returns: @@ -254,13 +288,14 @@ def compute_metrics( post_processed_targets.append({"boxes": boxes, "labels": labels}) # Collect predictions in the required format for metric computation, - # model produce boxes in YOLO format, then image_processor convert them to Pascal VOC format + # model produce boxes in YOLO format, then processor convert them to Pascal VOC format for batch, target_sizes in zip(predictions, image_sizes): batch_logits, batch_boxes = batch[1], batch[2] output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes)) - post_processed_output = image_processor.post_process_object_detection( - output, threshold=threshold, target_sizes=target_sizes + post_processed_output = processor.post_process_grounded_object_detection( + output, box_threshold=box_threshold, text_threshold=text_threshold, target_sizes=target_sizes ) + post_processed_output = convert_zero_shot_to_coco_format(post_processed_output, label2id) post_processed_predictions.extend(post_processed_output) # Compute metrics @@ -372,6 +407,14 @@ class ModelArguments: ) }, ) + freeze_backbone: bool = field( + default=True, + metadata={"help": ("Whether freeze the image backbone.")}, + ) + freeze_text_backbone: bool = field( + default=True, + metadata={"help": ("Whether freeze the text encoder.")}, + ) def main(): @@ -478,6 +521,13 @@ def main(): model_args.image_processor_name or model_args.model_name_or_path, ) + # Freeze both text_backbone + if model_args.freeze_backbone: + model.model.freeze_backbone() + if model_args.freeze_text_backbone: + for name, param in model.model.text_backbone.named_parameters(): + param.requires_grad_(False) + # ------------------------------------------------------------------------------------------------ # Define image augmentations and dataset transforms # ------------------------------------------------------------------------------------------------ @@ -513,10 +563,20 @@ def main(): # Make transform functions for batch and apply for dataset splits train_transform_batch = partial( - augment_and_transform_batch, transform=train_augment_and_transform, image_processor=processor + augment_and_transform_batch, + transform=train_augment_and_transform, + processor=processor, + id2label=id2label, + label2id=label2id, + random_text_prompt=False, ) validation_transform_batch = partial( - augment_and_transform_batch, transform=validation_transform, image_processor=processor + augment_and_transform_batch, + transform=validation_transform, + processor=processor, + id2label=id2label, + label2id=label2id, + random_text_prompt=False, ) dataset["train"] = dataset["train"].with_transform(train_transform_batch) @@ -527,7 +587,7 @@ def main(): # Model training and evaluation with Trainer API # ------------------------------------------------------------------------------------------------ - eval_compute_metrics_fn = partial(compute_metrics, image_processor=processor, id2label=id2label, threshold=0.0) + eval_compute_metrics_fn = partial(compute_metrics, processor=processor, id2label=id2label, label2id=label2id) trainer = Trainer( model=model, From b067cfd000d8bd4afe9619ba8b34f41525eec50e Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 8 Aug 2024 08:09:53 +0000 Subject: [PATCH 51/61] pre-final --- examples/pytorch/zero-shot/README.md | 2 ++ .../zero-shot/run_zero_shot_object_detection.py | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 
40c22be8d44d5e..ee9987c20672da 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -48,10 +48,12 @@ python run_zero_shot_object_detection.py \ --dataloader_num_workers 4 \ --dataloader_prefetch_factor 2 \ --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ --gradient_accumulation_steps 1 \ --remove_unused_columns false \ --eval_do_concat_batches false \ --ignore_mismatched_sizes true \ + --include_inputs_for_metrics true \ --metric_for_best_model eval_map \ --greater_is_better true \ --load_best_model_at_end true \ diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index eaf512700e3caf..581bd4b2acf045 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -263,8 +263,7 @@ def compute_metrics( Mapping[str, float]: Metrics in a form of dictionary {: } """ - predictions, targets = evaluation_results.predictions, evaluation_results.label_ids - + predictions, targets, inputs = evaluation_results.predictions, evaluation_results.label_ids, evaluation_results.inputs # For metric computation we need to provide: # - targets in a form of list of dictionaries with keys "boxes", "labels" # - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels" @@ -289,11 +288,11 @@ def compute_metrics( # Collect predictions in the required format for metric computation, # model produce boxes in YOLO format, then processor convert them to Pascal VOC format - for batch, target_sizes in zip(predictions, image_sizes): - batch_logits, batch_boxes = batch[1], batch[2] + for batch, target_sizes, input_ids in zip(predictions, image_sizes, inputs): + batch_logits, batch_boxes = batch[2], batch[3] output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes)) post_processed_output = processor.post_process_grounded_object_detection( - output, box_threshold=box_threshold, text_threshold=text_threshold, target_sizes=target_sizes + output, input_ids, box_threshold=box_threshold, text_threshold=text_threshold, target_sizes=target_sizes ) post_processed_output = convert_zero_shot_to_coco_format(post_processed_output, label2id) post_processed_predictions.extend(post_processed_output) @@ -576,11 +575,11 @@ def main(): processor=processor, id2label=id2label, label2id=label2id, - random_text_prompt=False, + random_text_prompt=True, ) - dataset["train"] = dataset["train"].with_transform(train_transform_batch) - dataset["validation"] = dataset["validation"].with_transform(validation_transform_batch) + dataset["train"] = dataset["test"].with_transform(train_transform_batch) + dataset["validation"] = dataset["test"].with_transform(validation_transform_batch) dataset["test"] = dataset["test"].with_transform(validation_transform_batch) # ------------------------------------------------------------------------------------------------ From 9567b651beacadba5636369c040742427d5c327e Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Thu, 8 Aug 2024 23:43:45 +0000 Subject: [PATCH 52/61] tmp --- examples/pytorch/zero-shot/README.md | 3 +-- .../zero-shot/run_zero_shot_object_detection.py | 15 +++++++++++++-- src/transformers/trainer.py | 10 +++++++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index ee9987c20672da..243be9fa77d93a 100644 --- 
a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -81,8 +81,7 @@ For dataset, make sure it provides labels in the same format as [CPPE-5](https:/ Note that zero-shot inference output is not the same output format as object-detection output. In order to compute the evaluation metric performance, we have to modify the output little bit. -![W&B report](https://i.imgur.com/ASNjamQ.png) - +Note that standard GPU usage for float16 is 10.2GB ## PyTorch version, no Trainer diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index 581bd4b2acf045..7bc139af3b8bc2 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -60,6 +60,13 @@ class ModelOutput: pred_boxes: torch.Tensor + +class ZeroShotTrainer(Trainer): + def _get_input_by_name(self): + """Simple getattr function for getting input by name""" + return getattr(self.model, "input_ids", "input_ids") + + def format_image_annotations_as_coco( image_id: str, categories: List[int], areas: List[float], bboxes: List[Tuple[float]] ) -> dict: @@ -263,7 +270,11 @@ def compute_metrics( Mapping[str, float]: Metrics in a form of dictionary {: } """ - predictions, targets, inputs = evaluation_results.predictions, evaluation_results.label_ids, evaluation_results.inputs + predictions, targets, inputs = ( + evaluation_results.predictions, + evaluation_results.label_ids, + evaluation_results.inputs, + ) # For metric computation we need to provide: # - targets in a form of list of dictionaries with keys "boxes", "labels" # - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels" @@ -588,7 +599,7 @@ def main(): eval_compute_metrics_fn = partial(compute_metrics, processor=processor, id2label=id2label, label2id=label2id) - trainer = Trainer( + trainer = ZeroShotTrainer( model=model, args=training_args, train_dataset=dataset["train"] if training_args.do_train else None, diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 59f0ed438bf7e4..a3e47308d475f7 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2247,7 +2247,7 @@ def _inner_training_loop( total_batched_samples += 1 if self.args.include_num_input_tokens_seen: - main_input_name = getattr(self.model, "main_input_name", "input_ids") + main_input_name = self._get_input_by_name() if main_input_name not in inputs: logger.warning( "Tried to track the number of tokens seen, however the current model is " @@ -3248,6 +3248,10 @@ def log(self, logs: Dict[str, float]) -> None: self.state.log_history.append(output) self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs) + def _get_input_by_name(self) -> str: + """Simple getattr function for getting input by name""" + return getattr(self.model, "main_input_name", "input_ids") + def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]: """ Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. 
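The reason this lookup is routed through a method: with `--include_inputs_for_metrics`, the `Trainer` forwards `inputs[main_input_name]` to `compute_metrics`, but for a zero-shot detection model the main input is presumably `pixel_values`, while the grounded post-processing needs the prompt's token ids. A minimal sketch of the pattern used by the example's `ZeroShotTrainer` (hypothetical class name, assuming the `_get_input_by_name` hook from this patch):

```python
from transformers import Trainer

class PromptAwareTrainer(Trainer):
    # Always hand the text token ids, rather than the model's default main input,
    # to compute_metrics when include_inputs_for_metrics is enabled.
    def _get_input_by_name(self) -> str:
        return "input_ids"
```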
@@ -3865,7 +3869,7 @@ def evaluation_loop( # Prediction step losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - main_input_name = getattr(self.model, "main_input_name", "input_ids") + main_input_name = self._get_input_by_name() inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None if is_torch_xla_available(): @@ -4455,7 +4459,7 @@ def prediction_loop( for step, inputs in enumerate(dataloader): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - main_input_name = getattr(self.model, "main_input_name", "input_ids") + main_input_name = self._get_input_by_name() inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None if loss is not None: From c12933833dd9a1f047497608d1e4fd6dd5d0c6d1 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 9 Aug 2024 00:13:30 +0000 Subject: [PATCH 53/61] pre-final --- examples/pytorch/zero-shot/README.md | 9 ++++++++- .../pytorch/zero-shot/run_zero_shot_object_detection.py | 7 +++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 243be9fa77d93a..1e9e771983ecdd 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -81,7 +81,14 @@ For dataset, make sure it provides labels in the same format as [CPPE-5](https:/ Note that zero-shot inference output is not the same output format as object-detection output. In order to compute the evaluation metric performance, we have to modify the output little bit. -Note that standard GPU usage for float16 is 10.2GB +| Train method | Batch size | freeze_text_backbone | freeze_backbone | precision | GPU Memory Usage | +|--------------|------------|----------------------|-----------------|-----------|------------------| +| trainer | 1 | Y | Y | fp16 | 14.839 GB | +| trainer | 2 | Y | Y | fp16 | 21.889 GB | +| trainer | 1 | Y | Y | fp32 | 16.301GB | +| no_trainer | 1 | Y | Y | fp32 | 20.949 GB | +| no_trainer | 1 | Y | N | fp32 | 21.691 GB | +| no_trainer | 1 | N | N | fp32 | 22.577 GB | ## PyTorch version, no Trainer diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index 7bc139af3b8bc2..d91d83d5931a1f 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -60,7 +60,6 @@ class ModelOutput: pred_boxes: torch.Tensor - class ZeroShotTrainer(Trainer): def _get_input_by_name(self): """Simple getattr function for getting input by name""" @@ -300,7 +299,7 @@ def compute_metrics( # Collect predictions in the required format for metric computation, # model produce boxes in YOLO format, then processor convert them to Pascal VOC format for batch, target_sizes, input_ids in zip(predictions, image_sizes, inputs): - batch_logits, batch_boxes = batch[2], batch[3] + batch_logits, batch_boxes = batch[1], batch[2] output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes)) post_processed_output = processor.post_process_grounded_object_detection( output, input_ids, box_threshold=box_threshold, text_threshold=text_threshold, target_sizes=target_sizes @@ -589,8 +588,8 @@ def main(): random_text_prompt=True, ) - dataset["train"] = dataset["test"].with_transform(train_transform_batch) - 
dataset["validation"] = dataset["test"].with_transform(validation_transform_batch) + dataset["train"] = dataset["train"].with_transform(train_transform_batch) + dataset["validation"] = dataset["validation"].with_transform(validation_transform_batch) dataset["test"] = dataset["test"].with_transform(validation_transform_batch) # ------------------------------------------------------------------------------------------------ From 30000e9aeab1b3e2133bd48bc1efcd76c28cb16f Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 9 Aug 2024 04:58:07 +0000 Subject: [PATCH 54/61] final --- examples/pytorch/zero-shot/README.md | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md index 1e9e771983ecdd..815064475c62db 100644 --- a/examples/pytorch/zero-shot/README.md +++ b/examples/pytorch/zero-shot/README.md @@ -71,7 +71,8 @@ python run_zero_shot_object_detection.py \ `--eval_do_concat_batches false` is required for correct evaluation of detection models; `--ignore_mismatched_sizes true` is required to load detection model for finetuning with different number of classes. -The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe-5-10k-steps. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. To improve model quality you could try: +The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe-5-10k-steps.. Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. To improve model quality you could try: + - changing freeze policy of image backbone and text backbone - changing image size parameters (`--shortest_edge`/`--longest_edge`) - changing training parameters, such as learning rate, batch size, warmup, optimizer and many more (see [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)) - adding more image augmentations (we created a helpful [HF Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo) to choose some) @@ -79,22 +80,27 @@ The resulting model can be seen here: https://huggingface.co/danelcsb/grounding- Note that you can replace the model and dataset by simply setting the `model_name_or_path` and `dataset_name` arguments respectively, with model or dataset from the [hub](https://huggingface.co/). For dataset, make sure it provides labels in the same format as [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset and boxes are provided in [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco). -Note that zero-shot inference output is not the same output format as object-detection output. In order to compute the evaluation metric performance, we have to modify the output little bit. +Note that zero-shot inference output is not the same output format as object-detection output. In order to compute the evaluation metric performance such as mean average precision, we have to modify the output little bit. 
-| Train method | Batch size | freeze_text_backbone | freeze_backbone | precision | GPU Memory Usage |
-|--------------|------------|----------------------|-----------------|-----------|------------------|
-| trainer | 1 | Y | Y | fp16 | 14.839 GB |
-| trainer | 2 | Y | Y | fp16 | 21.889 GB |
-| trainer | 1 | Y | Y | fp32 | 16.301GB |
-| no_trainer | 1 | Y | Y | fp32 | 20.949 GB |
-| no_trainer | 1 | Y | N | fp32 | 21.691 GB |
-| no_trainer | 1 | N | N | fp32 | 22.577 GB |
+| Train method | Batch size | freeze_text_backbone | freeze_backbone | precision | MSDA kernels | GPU Memory Usage (GB) | Time (s/epoch) |
+|--------------|------------|----------------------|-----------------|-----------|--------------|-----------------------|----------------|
+| trainer | 2 | Y | Y | fp16 | Y | 22.785 | 353 |
+| trainer | 1 | Y | Y | fp32 | Y | 8.813 | 429 |
+| no_trainer | 2 | N | N | fp32 | Y | OOM | - |
+| no_trainer | 1 | N | N | fp32 | N | 20.441 | 724 |
+| no_trainer | 1 | N | N | fp32 | Y | 11.243 | 473 |
+| no_trainer | 1 | Y | Y | fp32 | Y | 11.539 | 386 |
+
+The results above were measured on the following device:
+- Platform: Linux-5.4.0-167-generic-x86_64-with-glibc2.35
+- GPU type: NVIDIA TITAN RTX
+- PyTorch version (GPU): 2.2.2
 
 ## PyTorch version, no Trainer
 
 Based on the script [`run_zero_shot_object_detection_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_zero_shot_object_detection.py).
 
-The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.
+The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows you to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision. However, multi-GPU evaluation is currently not working due to the following [issue](https://github.com/Lightning-AI/torchmetrics/issues/2477).
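For reference, the `freeze_backbone` / `freeze_text_backbone` settings benchmarked in the table above boil down to disabling gradients on the corresponding sub-modules. A minimal sketch of that idea (the name-based filter below is a simplification and an assumption about parameter naming, not the scripts' exact flag handling):

```python
from transformers import AutoModelForZeroShotObjectDetection

model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")

# Freeze parameters whose names indicate they belong to the vision or text backbone,
# leaving the fusion layers and detection heads trainable.
for name, param in model.named_parameters():
    if "backbone" in name:
        param.requires_grad_(False)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters after freezing: {trainable:,}")
```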
First, run: From a1e4e1cc228cb744701ddb79cb0dfab55171bd87 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 11 Aug 2024 02:01:32 +0000 Subject: [PATCH 55/61] nit --- .../run_zero_shot_object_detection.py | 5 +-- src/transformers/trainer.py | 38 ++++++++----------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index d91d83d5931a1f..6338db1e8e5a70 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -61,9 +61,8 @@ class ModelOutput: class ZeroShotTrainer(Trainer): - def _get_input_by_name(self): - """Simple getattr function for getting input by name""" - return getattr(self.model, "input_ids", "input_ids") + def _select_inputs_for_validation(inputs): + return inputs["input_ids"] def format_image_annotations_as_coco( diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index bb25649e8e2bf6..0b2bf714c238ee 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2244,25 +2244,16 @@ def _inner_training_loop( total_batched_samples += 1 if self.args.include_num_input_tokens_seen: - main_input_name = self._get_input_by_name() - if main_input_name not in inputs: - logger.warning( - "Tried to track the number of tokens seen, however the current model is " - "not configured properly to know what item is the input. To fix this, add " - "a `main_input_name` attribute to the model class you are using." - ) - else: - self.state.num_input_tokens_seen += ( - torch.sum( - self.accelerator.gather( - torch.tensor( - inputs[main_input_name].numel(), device=self.args.device, dtype=torch.int64 - ) - ) + selected_inputs = self._select_inputs_for_validation(inputs) + self.state.num_input_tokens_seen += ( + torch.sum( + self.accelerator.gather( + torch.tensor(selected_inputs, device=self.args.device, dtype=torch.int64) ) - .cpu() - .item() ) + .cpu() + .item() + ) if rng_to_sync: self._load_rng_state(resume_from_checkpoint) rng_to_sync = False @@ -3245,9 +3236,10 @@ def log(self, logs: Dict[str, float]) -> None: self.state.log_history.append(output) self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs) - def _get_input_by_name(self) -> str: + def _select_inputs_for_validation(self, inputs): """Simple getattr function for getting input by name""" - return getattr(self.model, "main_input_name", "input_ids") + main_input_name = getattr(self.model, "main_input_name", "input_ids") + return inputs[main_input_name] def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]: """ @@ -3866,8 +3858,8 @@ def evaluation_loop( # Prediction step losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - main_input_name = self._get_input_by_name() - inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None + selected_inputs = self._select_inputs_for_validation(inputs) + inputs_decode = self._prepare_input(selected_inputs) if args.include_inputs_for_metrics else None if is_torch_xla_available(): xm.mark_step() @@ -4456,8 +4448,8 @@ def prediction_loop( for step, inputs in enumerate(dataloader): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - main_input_name = self._get_input_by_name() - inputs_decode = self._prepare_input(inputs[main_input_name]) 
if args.include_inputs_for_metrics else None + selected_inputs = self._select_inputs_for_validation(inputs) + inputs_decode = self._prepare_input(selected_inputs) if args.include_inputs_for_metrics else None if loss is not None: losses = loss.repeat(batch_size) From bee09008613fcc3d6b46ced243ca5cd8287ea8b7 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 11 Aug 2024 02:02:23 +0000 Subject: [PATCH 56/61] add self --- examples/pytorch/zero-shot/run_zero_shot_object_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index 6338db1e8e5a70..9a82ca4fc9863b 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -61,7 +61,7 @@ class ModelOutput: class ZeroShotTrainer(Trainer): - def _select_inputs_for_validation(inputs): + def _select_inputs_for_validation(self, inputs): return inputs["input_ids"] From f0cb798245803bf6c84690596f5b115bdede3a84 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Sun, 11 Aug 2024 02:13:32 +0000 Subject: [PATCH 57/61] add tests --- examples/pytorch/README.md | 2 +- examples/pytorch/test_pytorch_examples.py | 30 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index 4e318b3edb920c..2f2766a04e493d 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -48,7 +48,7 @@ Coming soon! | [**`semantic-segmentation`**](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation) | [SCENE_PARSE_150](https://huggingface.co/datasets/scene_parse_150) | ✅ | ✅ |✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | [**`object-detection`**](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection) | [CPPE-5](https://huggingface.co/datasets/cppe-5) | ✅ | ✅ |✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/pytorch/object_detection.ipynb) | [**`instance-segmentation`**](https://github.com/huggingface/transformers/tree/main/examples/pytorch/instance-segmentation) | [ADE20K sample](https://huggingface.co/datasets/qubvel-hf/ade20k-mini) | ✅ | ✅ |✅ | - +| [**`zero-shot`**](https://github.com/huggingface/transformers/tree/main/examples/pytorch/zero-shot) | [CPPE-5](https://huggingface.co/datasets/cppe-5) | ✅ | ✅ |✅ | / ## Running quick tests diff --git a/examples/pytorch/test_pytorch_examples.py b/examples/pytorch/test_pytorch_examples.py index c609ee860c728f..5bd17a0eb982cc 100644 --- a/examples/pytorch/test_pytorch_examples.py +++ b/examples/pytorch/test_pytorch_examples.py @@ -50,6 +50,7 @@ "semantic-segmentation", "object-detection", "instance-segmentation", + "zero-shot", ] ] sys.path.extend(SRC_DIRS) @@ -76,6 +77,7 @@ import run_swag import run_translation import run_wav2vec2_pretraining_no_trainer + import run_zero_shot_object_detection logging.basicConfig(level=logging.DEBUG) @@ -678,3 +680,31 @@ def test_run_instance_segmentation(self): run_instance_segmentation.main() result = get_results(tmp_dir) self.assertGreaterEqual(result["test_map"], 0.1) + + @patch.dict(os.environ, {"WANDB_DISABLED": "true"}) + def 
test_zero_shotrun_object_detection(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_zero_shot_object_detection.py + --model_name_or_path IDEA-Research/grounding-dino-tiny + --output_dir {tmp_dir} + --dataset_name qubvel-hf/cppe-5-sample + --do_train + --do_eval + --remove_unused_columns False + --overwrite_output_dir True + --eval_do_concat_batches False + --max_steps 10 + --learning_rate=5e-5 + --per_device_train_batch_size=1 + --per_device_eval_batch_size=1 + --seed 32 + """.split() + + if is_torch_fp16_available_on_device(torch_device): + testargs.append("--fp16") + + with patch.object(sys, "argv", testargs): + run_zero_shot_object_detection.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["test_map"], 0.01) From 9735814eccb611c74552b39d16e4d80f7b8f599c Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Fri, 16 Aug 2024 00:40:48 +0000 Subject: [PATCH 58/61] fix label --- examples/pytorch/zero-shot/run_zero_shot_object_detection.py | 2 +- .../zero-shot/run_zero_shot_object_detection_no_trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py index 9a82ca4fc9863b..32162f237c061b 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection.py @@ -144,7 +144,7 @@ def convert_zero_shot_to_coco_format(predictions, label2id): labels = prediction["labels"] for label in labels: if label in label2id: - torch_label.append(label) + torch_label.append(label2id[label]) else: # Give background class torch_label.append(0) diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py index 3b746aee7d2492..6aa13763544b02 100644 --- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py +++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py @@ -141,7 +141,7 @@ def convert_zero_shot_to_coco_format(predictions, label2id): labels = prediction["labels"] for label in labels: if label in label2id: - torch_label.append(label) + torch_label.append(label2id[label]) else: # Give background class torch_label.append(0) From 079691647f1e59bb3d8dec4361dbd88afdb05d82 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Wed, 21 Aug 2024 22:14:26 +0200 Subject: [PATCH 59/61] addressed comments --- .../grounding_dino/modeling_grounding_dino.py | 80 ++++++++++++------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 0cd821a2226b14..de9ba1cf1fdb3a 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -258,7 +258,7 @@ class GroundingDinoModelOutput(ModelOutput): weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the bi-attention heads. 
- enc_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): + encoder_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): Top `config.num_queries` scoring bounding boxes indices picked as region proposals in the first stage. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as @@ -283,7 +283,7 @@ class GroundingDinoModelOutput(ModelOutput): encoder_vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - enc_topk_proposals: Optional[torch.FloatTensor] = None + encoder_topk_proposals: Optional[torch.FloatTensor] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None encoder_logits: Optional[torch.FloatTensor] = None @@ -346,7 +346,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput): Stacked intermediate reference points (reference points of each layer of the decoder). init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): Initial reference points sent through the Transformer decoder. - enc_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): + encoder_topk_proposals (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*, returned when `config.two_stage=True`): Top `config.num_queries` scoring bounding boxes indices picked as region proposals in the first stage. enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as @@ -376,7 +376,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput): encoder_vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - enc_topk_proposals: Optional[torch.FloatTensor] = None + encoder_topk_proposals: Optional[torch.FloatTensor] = None enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None encoder_logits: Optional[torch.FloatTensor] = None @@ -2493,7 +2493,7 @@ def forward( encoder_vision_hidden_states=encoder_outputs.vision_hidden_states, encoder_text_hidden_states=encoder_outputs.text_hidden_states, encoder_attentions=encoder_outputs.attentions, - enc_topk_proposals=topk_proposals, + encoder_topk_proposals=topk_proposals, enc_outputs_class=enc_outputs_class, enc_outputs_coord_logits=enc_outputs_coord_logits, encoder_logits=encoder_logits, @@ -2591,8 +2591,8 @@ def generalized_box_iou(boxes1, boxes2): return iou - (area - union) / area -# Similar to `DeformableDetr` but we pass `num_queries` (because inputs are only the valid logits) -# and we also pass `reduction` for testing purposes. +# Similar to the one used in `DeformableDetr` but we pass `num_queries`, as `logits` are flattened +# due to masked selection, and support different `reduction` modes. 
def sigmoid_focal_loss( inputs: torch.Tensor, targets: torch.Tensor, @@ -2924,34 +2924,58 @@ def forward(self, outputs, targets): return losses -def build_label_maps(logits, input_ids): +def build_label_maps(logits: torch.FloatTensor, input_ids: torch.LongTensor) -> Tuple[torch.FloatTensor]: """ - Computes a mapping between the tokens associated with the prompt labels in the logit space with shape `(batch_size, num_labels, hidden_size)` - where `num_labels` is defined by the number of classes in the input prompt. - - For instance, given the prompt "fish. shark." we get input_ids = [ 101, 3869, 1012, 11420, 1012, 102]. - This function will return a mapping for each of the prompt tokens (i.e. tokens associated with "fish" and "shark") - indicating their position in the logit space. + Computes a mapping between tokens and their corresponding labels, where `num_labels` is determined by the number of classes in the input prompt. + The function identifies segments of tokens between specific delimiter tokens and generates label maps for those segments. + Args: + logits (`torch.Tensor` of shape `(batch_size, seq_length, hidden_size)`): + The output logits from the model, where `hidden_size` corresponds to the dimension of the model's output features. + input_ids (`torch.Tensor` of shape `(batch_size, seq_length)`): + The input token IDs corresponding to the input prompt. For example, given the prompt "fish. shark.", + `input_ids` might look like `[101, 3869, 1012, 11420, 1012, 102]` where each number corresponds to a token including special tokens. + Returns: + tuple: A tuple containing label maps for each instance in the batch. + - label_maps (tuple of `torch.Tensor`): + A tuple of tensors, where each tensor in the tuple corresponds to an instance in the batch. Each tensor + has shape `(num_labels, hidden_size)` and contains binary values (0 or 1), where `1` indicates the tokens + that are associated with a specific label (class) between delimiter tokens, and `0` elsewhere. + Example: + Given an input prompt "fish. shark." and corresponding `input_ids` as `[101, 3869, 1012, 11420, 1012, 102]`: + - The function identifies the tokens for "fish" (IDs `[3869]`) and "shark" (IDs `[11420]`). + - The function then constructs label maps for these tokens, where each label map indicates which tokens + correspond to which label between the delimiter tokens (e.g., between the period `.`). + - The output is a tuple of label maps, one for each instance in the batch. + Note: + - `SPECIAL_TOKENS` should be a predefined list of tokens that are considered special (e.g., `[CLS]`, `[SEP]`, etc.). 
""" - hidden_size = logits.shape[-1] + max_seq_len = logits.shape[-1] # Add [PAD] token to the list of special tokens delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) delimiter_token_masks = torch.isin(input_ids, delimiter_tokens) + label_groups = torch.cumsum(delimiter_token_masks, dim=1) * (~delimiter_token_masks).to(torch.int32) + label_maps = () - for delimiter_token_mask in delimiter_token_masks: - label_map_within_batch = [] - delimiter_indices = torch.where(delimiter_token_mask)[0] - for i in range(len(delimiter_indices) - 1): - start = delimiter_indices[i] - end = delimiter_indices[i + 1] - if end - start > 1: - label_map = torch.zeros(hidden_size, device=input_ids.device) - label_map[start + 1 : end] = 1 - label_map_within_batch.append(label_map) - - label_maps += (torch.stack(label_map_within_batch),) + + # Iterate over batch dimension as we can have different number of labels + for label_group in label_groups: + # `label_group` is a tensor of shape `(seq_len,)` with zeros for non-label tokens and integers for label tokens + # label tokens with same integer value are part of the same label group + + # Get unique labels and exclude 0 (i.e. non-label tokens) + unique_labels = torch.unique(label_group)[1:, None] + num_labels = unique_labels.shape[0] + + # Create one-hot encoding for each label group + label_map = label_group.unsqueeze(0).repeat(num_labels, 1) + label_map = torch.where(label_map == unique_labels, 1, 0) + + # Pad label_map to match `max_seq_len` + label_map = F.pad(label_map, (0, max_seq_len - label_map.shape[1]), value=0) + + label_maps += (label_map,) return label_maps @@ -3211,7 +3235,7 @@ def forward( intermediate_hidden_states=outputs.intermediate_hidden_states, intermediate_reference_points=outputs.intermediate_reference_points, init_reference_points=outputs.init_reference_points, - enc_topk_proposals=outputs.enc_topk_proposals, + encoder_topk_proposals=outputs.encoder_topk_proposals, enc_outputs_class=outputs.enc_outputs_class, enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, encoder_logits=outputs.encoder_logits, From 03f6f3ff3cc7620e41d4e8ff9dd5e9eda63e6b4b Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Wed, 21 Aug 2024 22:53:55 +0200 Subject: [PATCH 60/61] addressed one-hot --- .../grounding_dino/modeling_grounding_dino.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index de9ba1cf1fdb3a..fe6c63a4bc662c 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2760,15 +2760,19 @@ def _get_target_classes_one_hot(self, outputs, targets, indices): """ Create one_hot based on the matching indices """ - class_labels = [target["class_labels"] for target in targets] logits = outputs["logits"] - label_maps = outputs["label_maps"] + # Add offsets to class_labels to select the correct label map + class_labels = torch.cat( + [ + target["class_labels"][J] + len(outputs["label_maps"][i]) if i > 0 else target["class_labels"][J] + for i, (target, (_, J)) in enumerate(zip(targets, indices)) + ] + ) + label_maps = torch.cat(outputs["label_maps"], dim=0) + idx = self._get_source_permutation_idx(indices) target_classes_onehot = torch.zeros_like(logits, device=logits.device, dtype=torch.long) - - for i, (source, target) in enumerate(indices): - labels = 
class_labels[i][target]
-            target_classes_onehot[i, source] = label_maps[i][labels].to(torch.long)
+        target_classes_onehot[idx] = label_maps[class_labels].to(torch.long)
 
         return target_classes_onehot
 

From 2fca079b28753b69c4bedb7428be0a49c7ae3f5c Mon Sep 17 00:00:00 2001
From: sangbumchoi
Date: Wed, 11 Sep 2024 06:12:07 +0000
Subject: [PATCH 61/61] remove .item()

---
 .../zero-shot/run_zero_shot_object_detection_no_trainer.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py
index 6aa13763544b02..f3eb9b699bb1f6 100644
--- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py
+++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py
@@ -824,7 +824,7 @@ def main():
         if args.with_tracking:
             accelerator.log(
                 {
-                    "train_loss": total_loss.item() / len(train_dataloader),
+                    "train_loss": total_loss / len(train_dataloader),
                     **metrics,
                     "epoch": epoch,
                     "step": completed_steps,
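To make the label-map construction from `build_label_maps` (PATCH 59) easier to follow, here is a stripped-down, self-contained sketch run on the docstring's own "fish. shark." example. The hard-coded delimiter ids (101, 102, 1012, 0) are assumptions standing in for `SPECIAL_TOKENS` plus the [PAD] token, and the helper name is made up for illustration; the real function reads the delimiters from the module and takes the sequence length from the logits.

```python
import torch
import torch.nn.functional as F


def sketch_build_label_maps(input_ids, max_seq_len, delimiter_ids=(101, 102, 1012, 0)):
    """Illustrative re-implementation of the label-map construction for a batch of prompts."""
    delimiter_tokens = torch.tensor(delimiter_ids, device=input_ids.device)
    delimiter_token_masks = torch.isin(input_ids, delimiter_tokens)
    # Tokens between two delimiters share a group id; delimiter tokens themselves get 0
    label_groups = torch.cumsum(delimiter_token_masks, dim=1) * (~delimiter_token_masks).to(torch.int32)

    label_maps = ()
    for label_group in label_groups:
        unique_labels = torch.unique(label_group)[1:, None]  # drop group 0 (non-label tokens)
        label_map = label_group.unsqueeze(0).repeat(unique_labels.shape[0], 1)
        label_map = torch.where(label_map == unique_labels, 1, 0)
        # Pad to the logits' sequence length so it can be used as a one-hot target
        label_maps += (F.pad(label_map, (0, max_seq_len - label_map.shape[1]), value=0),)
    return label_maps


# "fish. shark." -> [CLS] fish . shark . [SEP]
input_ids = torch.tensor([[101, 3869, 1012, 11420, 1012, 102]])
print(sketch_build_label_maps(input_ids, max_seq_len=8)[0])
# tensor([[0, 1, 0, 0, 0, 0, 0, 0],    <- tokens belonging to "fish"
#         [0, 0, 0, 1, 0, 0, 0, 0]])   <- tokens belonging to "shark"
```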