From 842a28d096bc356070aedfadab94bf9148a3f6ae Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 6 Aug 2024 15:28:48 +0000
Subject: [PATCH 01/50] Add Idefics 3!

---
 docs/source/en/_toctree.yml                   |    2 +
 docs/source/en/index.md                       |    1 +
 docs/source/en/model_doc/idefics3.md          |   50 +
 docs/source/en/perf_infer_gpu_one.md          |    1 +
 src/transformers/__init__.py                  |   18 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    2 +
 .../models/auto/image_processing_auto.py      |    1 +
 src/transformers/models/auto/modeling_auto.py |    3 +
 .../models/auto/processing_auto.py            |    1 +
 .../models/auto/tokenization_auto.py          |    1 +
 src/transformers/models/idefics3/__init__.py  |   72 +
 .../models/idefics3/configuration_idefics3.py |  222 +++
 .../convert_idefics3_weights_to_hf.py         |  214 +++
 .../idefics3/image_processing_idefics3.py     |  813 ++++++++++
 .../models/idefics3/modeling_idefics3.py      | 1303 +++++++++++++++++
 .../models/idefics3/processing_idefics3.py    |  351 +++++
 src/transformers/utils/dummy_pt_objects.py    |   21 +
 .../utils/dummy_vision_objects.py             |    7 +
 tests/models/idefics3/__init__.py             |    0
 .../test_image_processing_idefics3.py         |  311 ++++
 .../models/idefics3/test_modeling_idefics3.py |  532 +++++++
 .../idefics3/test_processing_idefics3.py      |  235 +++
 23 files changed, 4162 insertions(+)
 create mode 100644 docs/source/en/model_doc/idefics3.md
 create mode 100644 src/transformers/models/idefics3/__init__.py
 create mode 100644 src/transformers/models/idefics3/configuration_idefics3.py
 create mode 100644 src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py
 create mode 100644 src/transformers/models/idefics3/image_processing_idefics3.py
 create mode 100644 src/transformers/models/idefics3/modeling_idefics3.py
 create mode 100644 src/transformers/models/idefics3/processing_idefics3.py
 create mode 100644 tests/models/idefics3/__init__.py
 create mode 100644 tests/models/idefics3/test_image_processing_idefics3.py
 create mode 100644 tests/models/idefics3/test_modeling_idefics3.py
 create mode 100644 tests/models/idefics3/test_processing_idefics3.py

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 7eff2a38302669..dfd1b4c50bdbfd 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -820,6 +820,8 @@
         title: IDEFICS
       - local: model_doc/idefics2
         title: Idefics2
+      - local: model_doc/idefics3
+        title: Idefics3
       - local: model_doc/instructblip
         title: InstructBLIP
       - local: model_doc/instructblipvideo
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index c18426de4c031c..6a43972286ef6e 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -168,6 +168,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                        [I-BERT](model_doc/ibert)                         |       ✅        |         ❌         |      ❌      |
 |                       [IDEFICS](model_doc/idefics)                       |       ✅        |         ✅         |      ❌      |
 |                      [Idefics2](model_doc/idefics2)                      |       ✅        |         ❌         |      ❌      |
+|                      [Idefics3](model_doc/idefics3)                      |       ✅        |         ❌         |      ❌      |
 |                      [ImageGPT](model_doc/imagegpt)                      |       ✅        |         ❌         |      ❌      |
 |                      [Informer](model_doc/informer)                      |       ✅        |         ❌         |      ❌      |
 |                  [InstructBLIP](model_doc/instructblip)                  |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
new file mode 100644
index 00000000000000..9391c9020ee502
--- /dev/null
+++ b/docs/source/en/model_doc/idefics3.md
@@ -0,0 +1,50 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Idefics3
+
+## Overview
+
+The Idefics3 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+
+Idefics3 is an adaptation of the Idefics2 model with three main differences:
+- the use of Llama3 for the text model
+- an updated processing logic for the images
+- the removal of the perceiver
+
+The resolutions of input images can be directly controlled, and they are decomposed into
+patches, or not, depending on the resolution. See [Idefics2](idefics2) for more details on the model architecture.
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
+The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+
+## Idefics3ImageProcessor
+[[autodoc]] Idefics3ImageProcessor
+    - preprocess
+
+
+## Idefics3Processor
+[[autodoc]] Idefics3Processor
+    - __call__
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index dd3433f2cd4862..18c385dfe68e08 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -53,6 +53,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#transformers.GPTJModel)
 * [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
 * [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
+* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
 * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
 * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index bfd0d37916b553..16c98b88041651 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -480,6 +480,7 @@
     "models.ibert": ["IBertConfig"],
     "models.idefics": ["IdeficsConfig"],
     "models.idefics2": ["Idefics2Config"],
+    "models.idefics3": ["Idefics3Config"],
     "models.imagegpt": ["ImageGPTConfig"],
     "models.informer": ["InformerConfig"],
     "models.instructblip": [
@@ -1180,6 +1181,7 @@
     _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
     _import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
     _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"])
+    _import_structure["models.idefics3"].extend(["Idefics3ImageProcessor"])
     _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
     _import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"])
     _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
@@ -2401,6 +2403,14 @@
             "Idefics2Processor",
         ]
     )
+    _import_structure["models.idefics3"].extend(
+        [
+            "Idefics3ForConditionalGeneration",
+            "Idefics3Model",
+            "Idefics3PreTrainedModel",
+            "Idefics3Processor",
+        ]
+    )
     _import_structure["models.imagegpt"].extend(
         [
             "ImageGPTForCausalImageModeling",
@@ -5247,6 +5257,7 @@
         IdeficsConfig,
     )
     from .models.idefics2 import Idefics2Config
+    from .models.idefics3 import Idefics3Config
     from .models.imagegpt import ImageGPTConfig
     from .models.informer import InformerConfig
     from .models.instructblip import (
@@ -5983,6 +5994,7 @@
         from .models.grounding_dino import GroundingDinoImageProcessor
         from .models.idefics import IdeficsImageProcessor
         from .models.idefics2 import Idefics2ImageProcessor
+        from .models.idefics3 import Idefics3ImageProcessor
         from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
         from .models.instructblipvideo import InstructBlipVideoImageProcessor
         from .models.layoutlmv2 import (
@@ -7011,6 +7023,12 @@
             Idefics2PreTrainedModel,
             Idefics2Processor,
         )
+        from .models.idefics3 import (
+            Idefics3ForConditionalGeneration,
+            Idefics3Model,
+            Idefics3PreTrainedModel,
+            Idefics3Processor,
+        )
         from .models.imagegpt import (
             ImageGPTForCausalImageModeling,
             ImageGPTForImageClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 2022048cd4553f..29c53ed32de36d 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -114,6 +114,7 @@
     ibert,
     idefics,
     idefics2,
+    idefics3,
     imagegpt,
     informer,
     instructblip,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 2cd7d550d90b7a..9267ea67613813 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -131,6 +131,7 @@
         ("ibert", "IBertConfig"),
         ("idefics", "IdeficsConfig"),
         ("idefics2", "Idefics2Config"),
+        ("idefics3", "Idefics3Config"),
         ("imagegpt", "ImageGPTConfig"),
         ("informer", "InformerConfig"),
         ("instructblip", "InstructBlipConfig"),
@@ -425,6 +426,7 @@
         ("ibert", "I-BERT"),
         ("idefics", "IDEFICS"),
         ("idefics2", "Idefics2"),
+        ("idefics3", "Idefics3"),
         ("imagegpt", "ImageGPT"),
         ("informer", "Informer"),
         ("instructblip", "InstructBLIP"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 95d9ddef8f7979..6bbc4ca605c95e 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -89,6 +89,7 @@
             ("hiera", ("BitImageProcessor",)),
             ("idefics", ("IdeficsImageProcessor",)),
             ("idefics2", ("Idefics2ImageProcessor",)),
+            ("idefics3", ("Idefics3ImageProcessor",)),
             ("imagegpt", ("ImageGPTImageProcessor",)),
             ("instructblip", ("BlipImageProcessor",)),
             ("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index e0d15f1e236590..fde37934a93089 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -128,6 +128,7 @@
         ("ibert", "IBertModel"),
         ("idefics", "IdeficsModel"),
         ("idefics2", "Idefics2Model"),
+        ("idefics3", "Idefics3Model"),
         ("imagegpt", "ImageGPTModel"),
         ("informer", "InformerModel"),
         ("jamba", "JambaModel"),
@@ -311,6 +312,7 @@
         ("ibert", "IBertForMaskedLM"),
         ("idefics", "IdeficsForVisionText2Text"),
         ("idefics2", "Idefics2ForConditionalGeneration"),
+        ("idefics3", "Idefics3ForConditionalGeneration"),
         ("layoutlm", "LayoutLMForMaskedLM"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
@@ -725,6 +727,7 @@
         ("chameleon", "ChameleonForConditionalGeneration"),
         ("git", "GitForCausalLM"),
         ("idefics2", "Idefics2ForConditionalGeneration"),
+        ("idefics3", "Idefics3ForConditionalGeneration"),
         ("instructblip", "InstructBlipForConditionalGeneration"),
         ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 82d325248eabfb..374a7e13482ff7 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -65,6 +65,7 @@
         ("hubert", "Wav2Vec2Processor"),
         ("idefics", "IdeficsProcessor"),
         ("idefics2", "Idefics2Processor"),
+        ("idefics3", "Idefics3Processor"),
         ("instructblip", "InstructBlipProcessor"),
         ("instructblipvideo", "InstructBlipVideoProcessor"),
         ("kosmos-2", "Kosmos2Processor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index e735579108d857..24e9521fc4e4b2 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -219,6 +219,7 @@
             ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
             ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+            ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
             ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
             (
diff --git a/src/transformers/models/idefics3/__init__.py b/src/transformers/models/idefics3/__init__.py
new file mode 100644
index 00000000000000..35b1df5c678439
--- /dev/null
+++ b/src/transformers/models/idefics3/__init__.py
@@ -0,0 +1,72 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+_import_structure = {"configuration_idefics3": ["Idefics3Config"]}
+
+
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_idefics3"] = ["Idefics3ImageProcessor"]
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_idefics3"] = [
+        "Idefics3ForConditionalGeneration",
+        "Idefics3PreTrainedModel",
+        "Idefics3Model",
+    ]
+    _import_structure["processing_idefics3"] = ["Idefics3Processor"]
+
+if TYPE_CHECKING:
+    from .configuration_idefics3 import Idefics3Config
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_idefics3 import Idefics3ImageProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_idefics3 import (
+            Idefics3ForConditionalGeneration,
+            Idefics3Model,
+            Idefics3PreTrainedModel,
+        )
+        from .processing_idefics3 import Idefics3Processor
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
new file mode 100644
index 00000000000000..8dd80791f4a309
--- /dev/null
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Idefics3 model configuration"""
+
+import os
+from typing import Union
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class Idefics3VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Idefics3VisionModel`]. It is used to instantiate a
+    Idefics3 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics3 model
+    [HuggingFaceM4/idefics3-8b](https://huggingface.co/HuggingFaceM4/idefics3-8b).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1152):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 32):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation for initializing all weight matrices in the model.
+
+    Example:
+
+    ```python
+    >>> from transformers.models.idefics3.modeling_idefics3 import Idefics3VisionTransformer
+    >>> from transformers.models.idefics3.configuration_idefics3 import Idefics3VisionConfig
+
+    >>> # Initializing a Idefics3VisionConfig with google/siglip-base-patch16-224 style configuration
+    >>> configuration = Idefics3VisionConfig()
+
+    >>> # Initializing a Idefics3VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
+    >>> model = Idefics3VisionTransformer(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics3"
+
+    def __init__(
+        self,
+        hidden_size=1152,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=224,
+        patch_size=32,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        # get the vision config dict if we are loading from Idefics3Config
+        if config_dict.get("model_type") == "idefics3":
+            config_dict = config_dict["vision_config"]
+
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+
+class Idefics3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Idefics3Model`]. It is used to instantiate a
+    Idefics3 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the model of the Idefics3
+    [HuggingFaceM4/idefics3-8b](https://huggingface.co/HuggingFaceM4/idefics3-8b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should cache the key/value pairs of the attention mechanism.
+        image_token_id (`int`, *optional*, defaults to 128257):
+            The id of the "image" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the word embeddings with the token embeddings.
+        vision_config (`Idefics3VisionConfig` or `dict`, *optional*):
+            Custom vision config or dict
+        text_config (`LlamaConfig` or `dict`, *optional*):
+            Custom text config or dict for the text model
+        scale_factor (`int`, *optional*, defaults to 2):
+            The scale factor for the image encoder.
+        vocab_size (`int`, *optional*, defaults to 100000):
+            The size of the vocabulary.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the encoder layers and the pooler layer.
+        pad_token_id (`int`, *optional*, defaults to 128002):
+            The id of the padding token.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum length of the input sequence.
+
+    Example:
+    ```python
+    >>> from transformers import Idefics3Model, Idefics3Config
+    >>> # Initializing configuration
+    >>> configuration = Idefics3Config()
+    >>> # Initializing a model from the configuration
+    >>> model = Idefics3Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics3"
+    is_composition = True
+
+    def __init__(
+        self,
+        use_cache=True,
+        image_token_id=128257,
+        tie_word_embeddings=False,
+        vision_config=None,
+        text_config=None,
+        scale_factor=2,
+        vocab_size=100000,
+        hidden_size=4096,
+        pad_token_id=128_002,
+        max_position_embeddings=131_072,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        self.vocab_size = vocab_size
+
+        if vision_config is None:
+            self.vision_config = Idefics3VisionConfig()
+            logger.info("vision_config is None, using default vision config")
+        elif isinstance(vision_config, dict):
+            self.vision_config = Idefics3VisionConfig(**vision_config)
+        elif isinstance(vision_config, Idefics3VisionConfig):
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            text_config["vocab_size"] = vocab_size
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            logger.info("text_config is None, using default text config")
+            text_config = CONFIG_MAPPING["llama"](
+                vocab_size=vocab_size,
+                max_position_embeddings=max_position_embeddings,
+                rms_norm_eps=1e-5,
+                pad_token_id=pad_token_id,
+                tie_word_embeddings=False,
+            )
+
+        text_config.vocab_size = vocab_size  # the top-level `vocab_size` always overrides the text config's value
+        self.text_config = text_config
+        self.scale_factor = scale_factor
+        self.hidden_size = hidden_size
+
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
diff --git a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py
new file mode 100644
index 00000000000000..fc52edd943ee03
--- /dev/null
+++ b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py
@@ -0,0 +1,214 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Idefics3Config,
+    Idefics3ForConditionalGeneration,
+    Idefics3ImageProcessor,
+    Idefics3Processor,
+    LlamaConfig,
+)
+
+
+EPILOG_TXT = """Example:
+    python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/idefics3-8b --output_hub_path org/idefics3
+"""
+
+
+KEYS_TO_MODIFY_MAPPING = {
+    "lm_head.weight": "lm_head.linear.weight",
+    "model.layers": "model.text_model.layers",
+    "model.norm": "model.text_model.norm",
+    "model.modality_projection": "model.connector.modality_projection",
+}
+
+
+WEIGHTS_TO_MERGE_MAPPING = (
+    # (weights to merge in merging order), (new weight name)
+    (
+        ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"),
+        "model.text_model.embed_tokens.weight",
+    ),
+    (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"),
+)
+
+WEIGHTS_TO_DROP = (
+    # The original model had a vision head, but this is never used
+    "model.vision_model.head",
+)
+
+
+def convert_state_dict_to_hf(state_dict):
+    """Rename original checkpoint keys to the HF layout; `state_dict` is drained except for weights to merge."""
+    renamed = {}
+    # Snapshot the keys: `state_dict` is mutated while we walk over it
+    source_keys = set(state_dict.keys())
+
+    # Every individual weight name that takes part in a merge, flattened into one list
+    mergeable = [name for names, _ in WEIGHTS_TO_MERGE_MAPPING for name in names]
+
+    for source_key in source_keys:
+        dropped = source_key.endswith(".inv_freq") or any(w in source_key for w in WEIGHTS_TO_DROP)
+        tensor = state_dict.pop(source_key)
+        if dropped:
+            continue
+
+        # Apply every matching substring rename, in mapping order
+        target_key = source_key
+        for pattern, replacement in KEYS_TO_MODIFY_MAPPING.items():
+            if pattern in target_key:
+                target_key = target_key.replace(pattern, replacement)
+
+        renamed[target_key] = tensor
+        if target_key in mergeable:
+            # Bit of a hack - we need to keep the original weights to merge them later
+            state_dict[target_key] = tensor
+
+    return renamed
+
+
+def merge_weights(state_dict, new_state_dict):
+    """
+    Concatenate the weight groups listed in `WEIGHTS_TO_MERGE_MAPPING` into single tensors.
+
+    Each source weight is popped from `state_dict`, the group is concatenated along dim 0 in mapping order and
+    stored in `new_state_dict` under the merged name; the individual source entries are then removed from it.
+
+    Raises:
+        KeyError: if a weight that should be merged is missing from `state_dict`.
+    """
+    for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING:
+        missing = [name for name in weights_to_merge if name not in state_dict]
+        if missing:
+            raise KeyError(f"Weights {missing} are missing in the state dict")
+
+        # Collect in a local list so a pre-existing `new_weight_name` entry cannot interfere with the merge
+        parts = [state_dict.pop(name) for name in weights_to_merge]
+        new_state_dict[new_weight_name] = torch.cat(parts, dim=0)
+
+    # Remove the weights that were merged
+    for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING:
+        for weight in weights_to_merge:
+            if weight in new_state_dict and weight != new_weight_name:
+                new_state_dict.pop(weight)
+
+    return new_state_dict
+
+
+def get_config(checkpoint):
+    """Download the original `config.json` and rebuild it as an `Idefics3Config` with a Llama text config."""
+
+    # download the config file
+    filepath = hf_hub_download(repo_id=checkpoint, filename="config.json")
+    with open(filepath, "r", encoding="utf-8") as f:
+        config_json = json.load(f)
+
+    # Setup the vision config
+    vision_config = config_json.pop("vision_config")
+    vision_config.pop("vision_model_name", None)
+    if "embed_dim" in vision_config:
+        vision_config["hidden_size"] = vision_config.pop("embed_dim")
+
+    config_json["vocab_size"] = config_json.pop("vocab_size") + config_json.pop("additional_vocab_size")
+
+    image_token_id = config_json.pop("image_token_id", config_json["vocab_size"] - 2)
+    use_cache = config_json.pop("use_cache", True)
+    tie_word_embeddings = config_json.pop("tie_word_embeddings", True)
+    scale_factor = config_json.pop("scale_factor", 2)
+    vocab_size = config_json.pop("vocab_size", 100000)
+
+    # Remove "freeze" params from the config
+    config_json = {k: v for k, v in config_json.items() if not k.startswith("freeze_")}
+    text_config = LlamaConfig(**config_json)
+
+    config = Idefics3Config(
+        text_config=text_config,
+        vision_config=vision_config,
+        use_cache=use_cache,
+        image_token_id=image_token_id,
+        tie_word_embeddings=tie_word_embeddings,
+        scale_factor=scale_factor,
+        vocab_size=vocab_size,
+    )
+    return config
+
+
+def convert_idefics3_hub_to_hf(original_model_id, output_hub_path, push_to_hub):
+    """Convert the original `AutoModelForCausalLM` checkpoint to `Idefics3ForConditionalGeneration` and save it."""
+    original_model = AutoModelForCausalLM.from_pretrained(
+        original_model_id, trust_remote_code=True, torch_dtype=torch.bfloat16
+    )
+    # The original model doesn't use the Idefics3 processing objects
+    image_processor = Idefics3ImageProcessor()
+    tokenizer = AutoTokenizer.from_pretrained(original_model_id)
+    processor = Idefics3Processor(
+        image_processor=image_processor,
+        tokenizer=tokenizer,
+    )
+    state_dict = original_model.state_dict()
+    new_state_dict = convert_state_dict_to_hf(state_dict)
+
+    # Merge weights
+    new_state_dict = merge_weights(state_dict, new_state_dict)
+    del state_dict
+
+    config = get_config(original_model_id)
+    print(config)
+
+    with init_empty_weights():
+        model = Idefics3ForConditionalGeneration(config)
+
+    model.load_state_dict(new_state_dict, strict=True, assign=True)
+
+    model.save_pretrained(output_hub_path)
+    processor.save_pretrained(output_hub_path)
+
+    if push_to_hub:
+        model.push_to_hub(output_hub_path, private=True)
+        processor.push_to_hub(output_hub_path, private=True)
+
+
+def main():
+    """Parse the command line arguments and run the conversion."""
+    parser = argparse.ArgumentParser(epilog=EPILOG_TXT, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "--original_model_id",
+        required=True,
+        help="Hub location of the original Idefics3 checkpoint to convert",
+    )
+    parser.add_argument(
+        "--output_hub_path",
+        required=True,
+        help="Location on the hub of the converted model",
+    )
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        help="If set, the model will be pushed to the hub after conversion.",
+    )
+    args = parser.parse_args()
+    convert_idefics3_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
new file mode 100644
index 00000000000000..2dedac9c9acad8
--- /dev/null
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -0,0 +1,813 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+    from PIL import Image
+
+
+def _resize_output_size_rescale_to_max_len(
+    height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None
+) -> Tuple[int, int]:
+    """
+    Get the output size of the image after rescaling its longest edge to `max_len`, preserving the aspect ratio.
+
+    The image is rescaled unconditionally: it is upscaled when its longest edge is smaller than `max_len` and
+    downscaled when it is larger.
+
+    Args:
+        height (`int`):
+            Height of the input image.
+        width (`int`):
+            Width of the input image.
+        min_len (`int`, *optional*, defaults to 1):
+            Lower bound applied to both the output height and the output width.
+        max_len (`int`, *optional*, defaults to the longest edge of the input image):
+            Length that the longest edge is rescaled to. The other edge is scaled by the same factor and truncated
+            to an integer.
+
+    Returns:
+        `Tuple[int, int]`: The `(height, width)` of the image after resizing.
+    """
+    max_len = max(height, width) if max_len is None else max_len
+    aspect_ratio = width / height
+
+    if width >= height:
+        width = max_len
+        height = int(width / aspect_ratio)
+    elif height > width:
+        height = max_len
+        width = int(height * aspect_ratio)
+
+    # Avoid resizing to a size smaller than min_len
+    height = max(height, min_len)
+    width = max(width, min_len)
+    return height, width
+
+
+def _resize_output_size_scale_below_upper_bound(
+    height: int, width: int, max_len: Optional[int] = None
+) -> Tuple[int, int]:
+    """
+    Get the output size of the image after scaling it down so that its longest edge is at most `max_len`.
+
+    The aspect ratio is preserved. An image whose longest edge is already smaller than or equal to `max_len`
+    keeps its size.
+
+    Args:
+        height (`int`):
+            Height of the input image.
+        width (`int`):
+            Width of the input image.
+        max_len (`int`, *optional*, defaults to the longest edge of the input image):
+            Upper bound for the longest edge of the output image. The other edge is scaled by the same factor
+            and truncated to an integer.
+
+    Returns:
+        `Tuple[int, int]`: The `(height, width)` of the image after resizing, each at least 1.
+    """
+    max_len = max(height, width) if max_len is None else max_len
+
+    aspect_ratio = width / height
+    if width >= height and width > max_len:
+        width = max_len
+        height = int(width / aspect_ratio)
+    elif height > width and height > max_len:
+        height = max_len
+        width = int(height * aspect_ratio)
+
+    # Avoid resizing to a size smaller than 1
+    height = max(height, 1)
+    width = max(width, 1)
+    return height, width
+
+
+def get_resize_output_image_size(
+    image,
+    resolution_max_side: int,
+    max_image_size: int = 1820,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Get the output size of the image after rescaling its longest edge to `resolution_max_side`.
+
+    Args:
+        image (`np.ndarray`):
+            Image to resize.
+        resolution_max_side (`int`):
+            The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
+            input aspect ratio, with a lower bound of 1.
+        max_image_size (`int`, *optional*, defaults to 1820):
+            Upper bound for the longest edge of the output image. A `ValueError` is raised if
+            `resolution_max_side` is larger than this value.
+        input_data_format (`ChannelDimension` or `str`):
+            The channel dimension format of the input image.
+
+    Returns:
+        `Tuple[int, int]`: The `(height, width)` of the image after resizing.
+    """
+    if resolution_max_side > max_image_size:
+        raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
+
+    height, width = get_image_size(image, channel_dim=input_data_format)
+
+    # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
+    height, width = _resize_output_size_rescale_to_max_len(
+        height, width, max_len=resolution_max_side
+    )
+    # Find the output size when scaling the image to be below the max_image_size
+    height, width = _resize_output_size_scale_below_upper_bound(
+        height, width, max_len=max_image_size
+    )
+    return height, width
+
+
+def split_image(
+    image: np.ndarray,
+    max_image_size: Dict[str, int],
+    resample: PILImageResampling = PILImageResampling.LANCZOS,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[List[np.ndarray], int, int]:
+    """
+    Image splitting strategy (`max_image_size` below stands for `max_image_size["longest_edge"]`).
+    1) If one side of the image is larger than `max_image_size`, divide the image into
+    `ceil(height / max_image_size)` x `ceil(width / max_image_size)` sub-images of approximately the same size
+    each (up to the fact that `max_image_size` does not divide `height` or `width`).
+    2) When the image was split, resize the original image to `max_image_size` x `max_image_size` (global image).
+    3) Return the crops followed by the global image, and the number of row and column splits (0 if not split).
+    """
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    max_height = max_width = max_image_size["longest_edge"]
+
+    frames = []
+    if height > max_height or width > max_width:
+        # Calculate the number of splits
+        num_splits_h = math.ceil(height / max_height)
+        num_splits_w = math.ceil(width / max_width)
+        # Calculate the optimal width and height for the sub-images
+        optimal_height = math.ceil(height / num_splits_h)
+        optimal_width = math.ceil(width / num_splits_w)
+
+        # Iterate through each row and column
+        for r in range(num_splits_h):
+            for c in range(num_splits_w):
+                # Calculate the starting point of the crop
+                start_x = c * optimal_width
+                start_y = r * optimal_height
+
+                # Calculate the ending point of the crop
+                end_x = min(start_x + optimal_width, width)
+                end_y = min(start_y + optimal_height, height)
+
+                # Crop the image
+                cropped_image = _crop(
+                    image, start_x, start_y, end_x, end_y, input_data_format=input_data_format, data_format=data_format
+                )
+                frames.append(cropped_image)
+
+        # For the global image at the end, we resize it to match the max_image_size, for cpu memory efficiency
+        global_image_height, global_image_width = max_height, max_width
+        if height != global_image_height or width != global_image_width:
+            image = resize(
+                image,
+                (global_image_height, global_image_width),
+                resample=resample,
+                input_data_format=input_data_format,
+            )
+    else:
+        num_splits_h, num_splits_w = 0, 0
+
+    if data_format is not None:
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+    frames.append(image)
+
+    return frames, num_splits_h, num_splits_w
+
+
+# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
+def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
+    """
+    Convert a single image or a list of images to a list of numpy arrays.
+
+    Args:
+        images (`ImageInput`):
+            A single image or a list of images.
+
+    Returns:
+        A list of numpy arrays.
+    """
+    # If it's a single image, convert it to a list of lists
+    if is_valid_image(images):
+        images = [[images]]
+    # If it's a list of images, it's a single batch, so convert it to a list of lists
+    elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]):
+        images = [images]
+    # If it's a list of batches, it's already in the right format
+    elif (
+        isinstance(images, (list, tuple))
+        and len(images) > 0
+        and isinstance(images[0], (list, tuple))
+        and is_valid_image(images[0][0])
+    ):
+        pass
+    else:
+        raise ValueError(
+            "Invalid input type. Must be a single image, a list of images, or a list of batches of images."
+        )
+    return images
+
+
+# Copied from transformers.models.detr.image_processing_detr.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> List[Any]:
+    """
+    Return the maximum value across all indices of an iterable of values.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+
+
+def get_max_height_width(
+    images_list: List[List[np.ndarray]], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> Tuple[int, int]:
+    """
+    Get the maximum height and width across all images in a batch, as a `(max_height, max_width)` tuple.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images_list[0][0])
+
+    image_sizes = []
+    for images in images_list:
+        for image in images:
+            image_sizes.append(get_image_size(image, channel_dim=input_data_format))
+
+    max_height, max_width = max_across_indices(image_sizes)
+    return (max_height, max_width)
+
+
+# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`Tuple[int, int]`):
+            Output size of the mask.
+    """
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+    mask = np.zeros(output_size, dtype=np.int64)
+    mask[:input_height, :input_width] = 1
+    return mask
+
+
+# Copied from transformers.models.idefics2.image_processing_idefics2.convert_to_rgb
+def convert_to_rgb(image: ImageInput) -> ImageInput:
+    """
+    Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
+    as is.
+    Args:
+        image (Image):
+            The image to convert.
+    """
+    if not isinstance(image, PIL.Image.Image):
+        return image
+
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images. The call to `alpha_composite` handles this case
+    if image.mode == "RGB":
+        return image
+
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+
+# FIXME Amy: make a more general crop function that isn't just centre crop
+def _crop(
+    image: np.ndarray,
+    w1: int,
+    h1: int,
+    w2: int,
+    h2: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> np.ndarray:
+    """Crop the region `[h1:h2, w1:w2]` out of `image`, optionally converting the crop to `data_format`."""
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+    if input_data_format == ChannelDimension.FIRST:
+        image = image[:, h1:h2, w1:w2]
+    elif input_data_format == ChannelDimension.LAST:
+        image = image[h1:h2, w1:w2, :]
+    else:
+        raise ValueError("Invalid channel dimension format.")
+
+    if data_format is not None:
+        # Pass the known layout explicitly: re-inferring it from the shape of a thin crop can pick the wrong axis.
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+    return image
+
+
+class Idefics3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Idefics3 image processor.
+
+    Args:
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB. This is useful if the input image is of a different format e.g. RGBA.
+            Only has an effect if the input image is in the PIL format.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image. The longest edge of the image is resized to  be <= `size["longest_edge"]`, with the
+            shortest edge resized to keep the input aspect ratio, with a minimum size of `size["shortest_edge"]`.
+        size (`Dict`, *optional*):
+            Controls the size of the output image. This is a dictionary containing the keys "shortest_edge" and "longest_edge".
+            The image will be resized such that the longest edge is <= `size["longest_edge"]` and the shortest edge is resized
+            to keep the input aspect ratio, with a lower bound of `size["shortest_edge"]`.
+        resample (`Resampling`, *optional*, defaults to `Resampling.LANCZOS`):
+            Resampling filter to use when resizing the image.
+        do_image_splitting (`bool`, *optional*, defaults to `True`):
+            Whether to split the image into sub-images concatenated with the original image. They are split into patches
+            such that each side of a patch is at most `max_image_size["longest_edge"]`.
+        max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
+            Maximum resolution of the images accepted by the model. This is a dictionary containing the key "longest_edge".
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1.
+        rescale_factor (`float`, *optional*, defaults to `1/255`):
+            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. If set to `True`, the image is normalized to have a mean of `image_mean` and
+            a standard deviation of `image_std`.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+            overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether or not to pad the images to the largest height and width in the batch and number of images per
+            sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
+        vision_encoder_max_size (`int`, *optional*, defaults to `364`):
+            Maximum size of the images accepted by the vision encoder. The images are split into patches of this size.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_convert_rgb: bool = True,
+        do_resize: bool = True,
+        size: Optional[Dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        do_image_splitting: bool = True,
+        max_image_size: Optional[Dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_pad: bool = True,
+        vision_encoder_max_size: int = 364,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.do_convert_rgb = do_convert_rgb
+        self.do_resize = do_resize
+        self.size = size if size is not None else {"longest_edge": 5 * 364}
+        self.resample = resample
+        self.do_image_splitting = do_image_splitting
+        self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 364}
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.do_pad = do_pad
+        self.vision_encoder_max_size = vision_encoder_max_size
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The longest edge of the image is resized to size["longest_edge"], with the shortest edge
+        resized to keep the input aspect ratio. Can also be used with size["height"] and size["width"].
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Size of the output image. Must contain either "longest_edge" or both "height" and "width".
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if "longest_edge" in size:
+            size = get_resize_output_image_size(
+                image, resolution_max_side=size["longest_edge"], input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
+        return resize(
+            image, size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
+
+    def split_image(
+        self,
+        image,
+        max_image_size: Dict[str, int],
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Split an image into a grid of sub-images, and then concatenate that sequence with the original image.
+        This is a "trick" to spend more compute on each image with no changes in the vision encoder.
+
+        Args:
+            image (`np.ndarray`):
+                Image to split.
+            max_image_size (`Dict[str, int]`):
+                Dictionary with the key "longest_edge". Images with a side larger than this value are split.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the returned images. If not provided, the input format is kept.
+        """
+        return split_image(image, max_image_size, input_data_format=input_data_format, data_format=data_format)
+
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: Tuple[int, int],
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pad the bottom and right of an image with `constant_values` up to `output_size`, given as (height, width).
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        return padded_image
+
+    def pad(
+        self,
+        images: List[List[np.ndarray]],
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> Tuple[List[List[np.ndarray]], Optional[List[List[np.ndarray]]]]:
+        """
+        Pad each image on the bottom and right with `constant_values` to the largest height and width in the batch.
+        Samples with fewer images are completed with all-zero images up to the largest number of images per sample.
+
+        Args:
+            images (`List[List[np.ndarray]]`):
+                List of list of images to pad. Pads to the largest height and width in the batch.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padded pixels.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                Unused by this method, which always returns nested lists of `np.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `Tuple`: The padded images as a list (one entry per sample) of lists of `np.ndarray`, and the matching
+            pixel masks (1 for valid pixels, 0 for padding) in the same nested layout, or `None` if
+            `return_pixel_mask` is `False`.
+        """
+        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        batch_size = len(images)
+        max_num_images = max(len(images_) for images_ in images)
+        input_data_format = (
+            infer_channel_dimension_format(images[0][0]) if input_data_format is None else input_data_format
+        )
+        data_format = input_data_format if data_format is None else data_format
+
+        if input_data_format == ChannelDimension.FIRST:
+            n_channels = images[0][0].shape[0]
+        elif input_data_format == ChannelDimension.LAST:
+            n_channels = images[0][0].shape[-1]
+        else:
+            raise ValueError("Invalid channel dimension format.")
+
+        def empty_image(size, input_data_format):
+            if input_data_format == ChannelDimension.FIRST:
+                return np.zeros((n_channels, *size), dtype=np.uint8)
+            elif input_data_format == ChannelDimension.LAST:
+                return np.zeros((*size, n_channels), dtype=np.uint8)
+            raise ValueError("Invalid channel dimension format.")
+
+        padded_images_list = [
+            [empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size)
+        ]
+        padded_masks = [[np.zeros(pad_size, dtype=np.int64) for _ in range(max_num_images)] for _ in range(batch_size)]
+
+        for batch_idx in range(batch_size):
+            for sample_idx, image in enumerate(images[batch_idx]):
+                padded_images_list[batch_idx][sample_idx] = self._pad_image(
+                    image,
+                    pad_size,
+                    constant_values=constant_values,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                padded_masks[batch_idx][sample_idx] = make_pixel_mask(
+                    image, output_size=pad_size, input_data_format=input_data_format
+                )
+
+        padded_masks = padded_masks if return_pixel_mask else None
+        return padded_images_list, padded_masks
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_convert_rgb: Optional[bool] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[Dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_image_splitting: Optional[bool] = None,
+        do_rescale: Optional[bool] = None,
+        max_image_size: Optional[Dict[str, int]] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_pad: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_row_col_info: bool = False,
+        input_data_format: Optional[ChannelDimension] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+    ):
+        """
+        Preprocess a batch of images.
+
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. With the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_image_splitting (`bool`, *optional*, defaults to `self.do_image_splitting`):
+                Whether to split the image into sub-images concatenated with the original image. They are split into patches
+                such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
+            max_image_size (`Dict`, *optional*, defaults to `self.max_image_size`):
+                Maximum resolution of the images. If the image is larger than this size, the image is split into patches.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether or not to pad the images to the largest height and width in the batch.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            return_row_col_info (`bool`, *optional*, default to `False`):
+                Whether to return the number of rows and columns of the split images. This is used for the
+                `Idefics3Processor` to generate prompt strings based on the number of rows and columns.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting
+        max_image_size = max_image_size if max_image_size is not None else self.max_image_size
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_pad = do_pad if do_pad is not None else self.do_pad
+
+        images_list = make_list_of_images(images)
+
+        if not valid_images(images_list[0]):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
+
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+
+        if is_scaled_image(images_list[0][0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images_list[0][0])
+
+        if do_resize:
+            images_list = [
+                [
+                    self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        # Resize height and width of each image up to the next multiple of `self.vision_encoder_max_size`,
+        # disregarding the aspect ratio. With a max size of 364: size=(10, 364) -> rescaled_size=(364, 364)
+        # and size=(11, 365) -> rescaled_size=(364, 364*2)
+        new_images_list = []
+        for images in images_list:
+            new_images = []
+            for img in images:
+                height, width, _ = img.shape
+                new_size = {"height": math.ceil(height / self.vision_encoder_max_size) * self.vision_encoder_max_size,
+                            "width": math.ceil(width/ self.vision_encoder_max_size) * self.vision_encoder_max_size}
+                new_images.append(self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format))
+            new_images_list.append(new_images)
+        images_list = new_images_list
+        del new_images_list
+
+        if do_image_splitting:
+            images_list_split_arrays = []
+            images_list_rows = []
+            images_list_cols = []
+            for images in images_list:
+                split_image_arrays = []
+                image_rows = []
+                image_cols = []
+                for image in images:
+                    split_image_array, rows, cols = self.split_image(
+                        image,
+                        max_image_size=max_image_size,
+                        input_data_format=input_data_format,
+                    )
+                    split_image_arrays.extend(split_image_array)
+                    image_rows.append(rows)
+                    image_cols.append(cols)
+                images_list_split_arrays.append(split_image_arrays)
+                images_list_rows.append(image_rows)
+                images_list_cols.append(image_cols)
+            images_list = images_list_split_arrays
+        else:
+            images_list_rows = [[0] * len(images) for images in images_list]
+            images_list_cols = [[0] * len(images) for images in images_list]
+
+        if do_rescale:
+            images_list = [
+                [
+                    self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        if do_normalize:
+            images_list = [
+                [
+                    self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        pixel_attention_mask = None
+        if do_pad:
+            images_list, pixel_attention_mask = self.pad(
+                images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
+            )
+
+        if data_format is not None:
+            images_list = [
+                [
+                    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        data = {"pixel_values": np.array(images_list) if do_pad else images_list}  # Faster tensor conversion
+        if pixel_attention_mask is not None:
+            data["pixel_attention_mask"] = np.array(pixel_attention_mask) if do_pad else pixel_attention_mask
+
+        encoding = BatchFeature(data=data, tensor_type=return_tensors)
+
+        # This is needed for generating correct text inputs in the processor - we don't pad to the max number of images
+        if return_row_col_info:
+            encoding["rows"] = images_list_rows
+            encoding["cols"] = images_list_cols
+
+        return encoding
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
new file mode 100644
index 00000000000000..e3f554c8594e4f
--- /dev/null
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -0,0 +1,1303 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Idefics3 model."""
+
+import math
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ... import PreTrainedModel
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_outputs import BaseModelOutput, ModelOutput
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from ..auto import AutoModel
+from .configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
+
+
+if is_flash_attn_2_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Idefics3Config"
+
+
+@dataclass
+class Idefics3BaseModelOutputWithPast(ModelOutput):
+    """
+    Base class for Idefics3 model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+            num_images, sequence_length, hidden_size)`.
+            Image hidden states of the model produced by the vision encoder.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics3
+class Idefics3CausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Idefics3 causal language model (or autoregressive) outputs.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+            sequence_length, hidden_size)`.
+            image_hidden_states of the model produced by the vision encoder
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class Idefics3VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
+    which allows treating images in their native aspect ratio and without the need to resize them to the same
+    fixed size. In particular, we start from the original pre-trained SigLIP model
+    (which uses fixed-size square images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, config: Idefics3VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
+        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
+        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            nb_patches_h = p_attn_mask[:, 0].sum()
+            nb_patches_w = p_attn_mask[0].sum()
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+        position_ids = position_ids.to(self.position_embedding.weight.device)
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics3Vision
+class Idefics3VisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+        # Ignore copy
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        k_v_seq_len = key_states.shape[-2]
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+
+        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class Idefics3VisionFlashAttention2(Idefics3VisionAttention):
+    """
+    Idefics3Vision flash attention module. This module inherits from `Idefics3VisionAttention` as the weights of the module stay
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
+        key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (Idefics3VisionRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+IDEFICS_VISION_ATTENTION_CLASSES = {
+    "eager": Idefics3VisionAttention,
+    "flash_attention_2": Idefics3VisionFlashAttention2,
+}
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics3Vision
+class Idefics3VisionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+class Idefics3SimpleMLP(nn.Module):
+    def __init__(self, input_size, output_size):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.proj = nn.Linear(input_size, output_size, bias=False)
+
+    def forward(self, x):
+        return self.proj(x)
+
+class Idefics3EncoderLayer(nn.Module):
+    def __init__(self, config: Idefics3Config):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = IDEFICS_VISION_ATTENTION_CLASSES[config._attn_implementation](config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Idefics3VisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics3
+class Idefics3Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Idefics3EncoderLayer`].
+
+    Args:
+        config: Idefics3Config
+    """
+
+    def __init__(self, config: Idefics3Config):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([Idefics3EncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    # Ignore copy
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class Idefics3VisionTransformer(nn.Module):
+    """
+    Vision tower of Idefics3: patch embeddings, a transformer encoder and a final layer norm.
+
+    Args:
+        config ([`Idefics3VisionConfig`]):
+            Vision configuration. `hidden_size`, `layer_norm_eps`, `patch_size` and `_attn_implementation` are read
+            here; the whole object is also handed to the embeddings and to the encoder.
+    """
+
+    def __init__(self, config: Idefics3VisionConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+
+        self.config = config
+        self.embeddings = Idefics3VisionEmbeddings(config)
+        self.encoder = Idefics3Encoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        # Cached once: decides in `forward` whether a 2D mask must be expanded to 4D.
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings = value
+
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        """
+        Encode a batch of images into a sequence of patch hidden states.
+
+        Args:
+            pixel_values:
+                Image tensor. Dim 0 is used as the batch size and dims 2 and 3 as the spatial axes that are divided
+                by `config.patch_size`.
+            patch_attention_mask (`torch.BoolTensor`, *optional*):
+                Per-patch mask, flattened here to `(batch_size, -1)`. When omitted, every patch is attended to.
+            output_attentions (`bool`, *optional*):
+                Forwarded to the encoder; defaults to `config.output_attentions`.
+            output_hidden_states (`bool`, *optional*):
+                Forwarded to the encoder; defaults to `config.output_hidden_states`.
+            return_dict (`bool`, *optional*):
+                Return a [`BaseModelOutput`] instead of a plain tuple; defaults to `config.use_return_dict`.
+
+        Returns:
+            A [`BaseModelOutput`] (or the equivalent tuple) whose `last_hidden_state` has been passed through
+            `post_layernorm`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size = pixel_values.size(0)
+        if patch_attention_mask is None:
+            patch_size = self.config.patch_size
+            # Allocate the all-ones mask directly as bool on the input's device instead of creating a float
+            # tensor on the default device and converting/copying it afterwards.
+            patch_attention_mask = torch.ones(
+                (
+                    batch_size,
+                    pixel_values.size(2) // patch_size,
+                    pixel_values.size(3) // patch_size,
+                ),
+                dtype=torch.bool,
+                device=pixel_values.device,
+            )
+
+        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+
+        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+        # The call to `_upad_input` in `_flash_attention_forward` is expensive
+        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
+        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
+        if not torch.any(~patch_attention_mask):
+            patch_attention_mask = None
+        elif not self._use_flash_attention_2:
+            # Non-flash attention implementations receive the mask expanded to 4D.
+            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=patch_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        if not return_dict:
+            return (last_hidden_state,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+
+    Args:
+        hidden_states (`torch.Tensor`): 4D tensor unpacked as (batch, num_key_value_heads, seqlen, head_dim).
+        n_rep (`int`): Number of times each key/value head is repeated along dim 1.
+
+    Returns:
+        `torch.Tensor`: The input itself when `n_rep == 1`, otherwise a tensor of shape
+        (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        # Nothing to repeat: hand back the very same tensor, no copy is made.
+        return hidden_states
+    # Add a length-1 axis after the head axis and broadcast it to `n_rep`; `expand` itself does not copy data.
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    # Fold the repeat axis into the head axis so each original head appears `n_rep` times consecutively.
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Idefics3
+class Idefics3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Idefics3RMSNorm is equivalent to T5LayerNorm
+
+        Args:
+            hidden_size: Size of the last dimension; one learnable scale, initialised to 1, is created per feature.
+            eps: Constant added to the mean of squares before taking the reciprocal square root.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # Remember the caller's dtype so the result can be cast back after the float32 computation.
+        input_dtype = hidden_states.dtype
+        # The statistics are computed in float32 regardless of the input dtype.
+        hidden_states = hidden_states.to(torch.float32)
+        # Mean of squares over the last dimension (no mean subtraction).
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast back first, then apply the learned per-feature scale.
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class Idefics3Connector(nn.Module):
+    """
+    Connects the vision encoder to the language model.
+
+    The patch sequence is first shortened by a pixel shuffle (each `scale_factor x scale_factor` neighbourhood of
+    patches is folded into the feature dimension) and then projected by `modality_projection` to
+    `config.text_config.hidden_size`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.scale_factor = config.scale_factor
+        # After the shuffle every token carries scale_factor**2 patches worth of features.
+        shuffled_width = config.vision_config.hidden_size * (self.scale_factor**2)
+        self.modality_projection = Idefics3SimpleMLP(
+            input_size=shuffled_width,
+            output_size=config.text_config.hidden_size,
+        )
+
+    def pixel_shuffle(self, x, scale_factor=2):
+        """
+        Fold a square grid of tokens into a coarser grid with wider features.
+
+        `x` is unpacked as (batch, seq, embed_dim) and `seq` is treated as a square grid of side `int(seq**0.5)`.
+        The result has shape (batch, seq / scale_factor**2, embed_dim * scale_factor**2).
+        """
+        batch, num_tokens, channels = x.size()
+        side = int(num_tokens**0.5)
+        reduced_side = int(side / scale_factor)
+        wide_channels = channels * scale_factor
+        full_channels = channels * (scale_factor**2)
+
+        grid = x.view(batch, side, side, channels)
+        # Merge `scale_factor` horizontally adjacent patches into the feature axis.
+        grid = grid.view(batch, side, reduced_side, wide_channels)
+        grid = grid.transpose(1, 2).contiguous()
+        # Same merge along the other spatial axis (the axes are swapped at this point).
+        grid = grid.view(batch, reduced_side, int(side / scale_factor), full_channels)
+        grid = grid.transpose(1, 2).contiguous()
+        return grid.view(batch, int(num_tokens / (scale_factor**2)), full_channels)
+
+    def forward(self, image_hidden_states):
+        shuffled = self.pixel_shuffle(image_hidden_states, self.scale_factor)
+        return self.modality_projection(shuffled)
+
+
+IDEFICS3_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Idefics3Config`] or [`Idefics3VisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Idefics3 Model outputting raw hidden-states without any specific head on top.",
+    IDEFICS3_START_DOCSTRING,
+)
+class Idefics3PreTrainedModel(PreTrainedModel):
+    # Class-level switches read by `PreTrainedModel`; documented with comments because the class docstring is
+    # generated by the `add_start_docstrings` decorator above.
+    config_class = Idefics3Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Idefics3VisionAttention", "Idefics3SimpleMLP", "Idefics3DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+
+    def _init_weights(self, module):
+        """
+        Initialise `module` in place with a normal distribution.
+
+        The standard deviation is `config.initializer_range` when the model config defines it, otherwise the text
+        config's `initializer_range`. Linear/Conv2d biases and the embedding row at `padding_idx` are zeroed.
+        """
+        # Prefer the model-level value; previously both branches read the text config, which made the
+        # `hasattr` check meaningless.
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                # Keep the padding embedding at exactly zero.
+                module.weight.data[module.padding_idx].zero_()
+
+    @classmethod
+    def _autoset_attn_implementation(
+        cls,
+        config,
+        use_flash_attention_2: bool = False,
+        torch_dtype: Optional[torch.dtype] = None,
+        device_map: Optional[Union[str, Dict[str, int]]] = None,
+        check_device_map: bool = True,
+        **kwargs,
+    ):
+        """
+        Overrides the method in `PreTrainedModel` to update the vision config with the correct attention implementation
+        """
+        config = super()._autoset_attn_implementation(
+            config=config,
+            use_flash_attention_2=use_flash_attention_2,
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+            check_device_map=check_device_map,
+            **kwargs,
+        )
+        # Mirror the resolved implementation onto the nested vision config so the vision tower uses the same one.
+        config.vision_config._attn_implementation = config._attn_implementation
+        return config
+
+
+IDEFICS3_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, max_num_images, num_channels, height, width)`, *optional*):
+            The tensors corresponding to the input images, padded to `max_num_images` images per sample. Pixel
+            values can be obtained using [`AutoImageProcessor`]. See [`Idefics3ImageProcessor.__call__`] for details
+            ([`Idefics3Processor`] uses [`Idefics3ImageProcessor`] for processing images).
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, max_num_images, height, width)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(num_images, image_seq_len, hidden_size)`, *optional*):
+            The hidden states of the image encoder after modality projection.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    """Idefics3 model consisting of a SIGLIP vision encoder and Llama3 language decoder""",
+    IDEFICS3_START_DOCSTRING,
+)
+class Idefics3Model(Idefics3PreTrainedModel):
+    def __init__(self, config: Idefics3Config):
+        super().__init__(config)
+        self.padding_idx = self.config.text_config.pad_token_id
+        self.vocab_size = self.config.text_config.vocab_size
+
+        self.vision_model = Idefics3VisionTransformer(config.vision_config)
+        self.connector = Idefics3Connector(config)
+        self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation)
+
+        # Number of embeddings one image contributes after the connector's pixel shuffle.
+        self.image_seq_len = int(((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2))
+        self.image_token_id = self.config.image_token_id
+
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+        self.post_init()
+
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings.
+
+        This is useful for lora when using gradient checkpointing.
+        c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032
+
+        Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
+        """
+
+        def get_lowest_module(module):
+            # Follow the first child at every level until a module without children (a leaf) is reached.
+            children = list(module.children())
+            if not children:
+                return module
+            return get_lowest_module(children[0])
+
+        def make_inputs_require_grads(module, input, output):
+            output.requires_grad_(True)
+
+        # The hook handles are kept on the instance so they can be removed later.
+        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
+        self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
+            make_inputs_require_grads
+        )
+
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_model.set_input_embeddings(value)
+
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        """Resize the text model's token embeddings and record the new size in `config.text_config.vocab_size`."""
+        model_embeds = self.text_model.resize_token_embeddings(
+            new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of
+        )
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+
+    def inputs_merger(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: Optional[torch.Tensor],
+        image_hidden_states: Optional[torch.Tensor],
+    ):
+        """
+        This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
+        The merging happens as follows:
+        - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
+        - We get the image hidden states for the image through the vision encoder and that hidden state, after a pixel shuffle operation, is then projected into the text embedding space.
+        We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
+        - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_tok_around_image vector_tok_4`. That sequence is fed to the LM.
+        - Only the embeddings change: the positions where `input_ids == image_token_id` are overwritten, in order, in a copy of `inputs_embeds`. `input_ids` and `inputs_embeds` themselves are left untouched.
+
+        Returns:
+            `torch.Tensor`: A clone of `inputs_embeds` with the image-token positions replaced by the image hidden states.
+        """
+        special_image_token_mask = input_ids == self.image_token_id
+        vision_hidden_size = image_hidden_states.shape[-1]
+        new_inputs_embeds = inputs_embeds.clone()
+        # Flatten all images into one list of vectors so they line up with the flattened mask positions.
+        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
+        new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
+        return new_inputs_embeds
+
+    @add_start_docstrings_to_model_forward(
+        """
+        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
+        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
+        max_num_images is the maximum number of images among the batch_size samples in the batch.
+        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
+        For efficiency, we only pass through the vision_model's forward the real images by
+        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
+        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
+        """,
+        IDEFICS3_INPUTS_DOCSTRING,
+    )
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Idefics3BaseModelOutputWithPast]:
+        # NOTE: no docstring here on purpose; `add_start_docstrings_to_model_forward` generates it.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.training and self.text_model.gradient_checkpointing and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
+            use_cache = False
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        past_seen_tokens = 0
+        return_legacy_cache = False
+        if use_cache:
+            if not isinstance(past_key_values, Cache):  # kept for BC (non `Cache` `past_key_values` inputs)
+                return_legacy_cache = True
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_seen_tokens = past_key_values.get_seq_length()
+
+        if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
+            raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(self.device)
+
+        # START VISUAL INPUTS INTEGRATION
+        if pixel_values is not None and image_hidden_states is not None:
+            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
+        elif pixel_values is not None:
+            batch_size, num_images, num_channels, height, width = pixel_values.shape
+            pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+            pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
+
+            # Remove padding images - padding images are full 0.
+            nb_values_per_image = pixel_values.shape[1:].numel()
+            real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+            pixel_values = pixel_values[real_images_inds].contiguous()
+
+            # Handle the vision attention mask
+            if pixel_attention_mask is None:
+                pixel_attention_mask = torch.ones(
+                    size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
+                    dtype=torch.bool,
+                    device=pixel_values.device,
+                )
+            else:
+                # Remove padding images from the mask
+                pixel_attention_mask = pixel_attention_mask.view(
+                    batch_size * num_images, *pixel_attention_mask.shape[2:]
+                )
+                pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+
+            # Reduce the pixel mask to a patch mask: a patch is kept if any of its pixels is unmasked.
+            patch_size = self.config.vision_config.patch_size
+            patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+            patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+            # Get sequence from the vision encoder
+            image_hidden_states = self.vision_model(
+                pixel_values=pixel_values,
+                patch_attention_mask=patch_attention_mask,
+            ).last_hidden_state
+
+            # Modality projection & resampling
+            image_hidden_states = self.connector(image_hidden_states)
+
+        elif image_hidden_states is not None:
+            # `input_ids` may be None here (cached decoding from `inputs_embeds`), whereas `inputs_embeds` is
+            # always set at this point, so take the target device from it.
+            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=inputs_embeds.device)
+
+        if past_seen_tokens == 0 and image_hidden_states is not None:
+            # When we generate, we don't want to replace the potential image_token_id that we generated by images
+            # that simply don't exist
+            inputs_embeds = self.inputs_merger(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                image_hidden_states=image_hidden_states,
+            )
+
+        # Always ask the text model for a ModelOutput: the legacy-cache conversion below assigns an attribute,
+        # which is impossible on a plain tuple. The resolved `use_cache` is forwarded so the decoder does not
+        # silently fall back to its own config default.
+        outputs = self.text_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        if return_legacy_cache and use_cache:
+            outputs.past_key_values = outputs.past_key_values.to_legacy_cache()
+
+        if not return_dict:
+            # `to_tuple()` yields the non-None fields in declaration order, matching the tuple the text model
+            # would have returned itself.
+            return tuple(v for v in [*outputs.to_tuple(), image_hidden_states] if v is not None)
+
+        return Idefics3BaseModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """The Idefics3 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top. """,
+    IDEFICS3_START_DOCSTRING,
+)
+class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Idefics3Model(config)
+        self.image_token_id = self.config.image_token_id
+
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
+        the model weights fixed.
+        """
+
+        def make_inputs_require_grads(module, input, output):
+            output.requires_grad_(True)
+
+        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
+        self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook(
+            make_inputs_require_grads
+        )
+
+    def get_input_embeddings(self):
+        return self.model.text_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.text_model.set_input_embeddings(value)
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        # model_embeds = self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of)
+        model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        if new_num_tokens is None and pad_to_multiple_of is None:
+            return model_embeds
+
+        # Update base model and current model config
+        # Ignore copy
+        self.config.text_config.vocab_size = model_embeds.weight.shape[0]
+        self.vocab_size = self.config.text_config.vocab_size
+
+        # Tie weights again if needed
+        self.tie_weights()
+
+        return model_embeds
+
+    def tie_weights(self):
+        """
+        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
+        """
+        output_embeddings = self.get_output_embeddings()
+        input_embeddings = self.get_input_embeddings()
+
+        if getattr(self.config, "tie_word_embeddings", True):
+            output_embeddings.weight = input_embeddings.weight
+
+    @add_start_docstrings_to_model_forward(IDEFICS3_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Idefics3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Idefics3CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`).
+                Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
+                computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
+        >>> from transformers.image_utils import load_image
+
+        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
+        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
+        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
+
+        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics3-8b")
+        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics3-8b", device_map="auto")
+
+        >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+        >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
+
+        >>> # Create inputs
+        >>> prompts = [
+        ...   "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
+        ...   "In which city is that bridge located?<image>",
+        ... ]
+        >>> images = [[image1, image2], [image3]]
+        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
+        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        >>> print(generated_texts)
+        ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
+            image_hidden_states=image_hidden_states,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:].to(logits.device)
+                shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return Idefics3CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        past_length = 0
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            # `past_key_values` must be a `Cache`-like object exposing `get_seq_length` and `get_max_length`
+            past_length = past_key_values.get_seq_length()
+            max_cache_length = past_key_values.get_max_length()
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and past_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation (masked positions get the dummy value 1)
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_length == 0:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        image_hidden_states = kwargs.get("image_hidden_states", None)
+        if image_hidden_states is not None:
+            pixel_values = None
+            pixel_attention_mask = None
+        else:
+            pixel_values = kwargs.get("pixel_values", None)
+            pixel_attention_mask = kwargs.get("pixel_attention_mask", None)
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+                "pixel_attention_mask": pixel_attention_mask,
+                "image_hidden_states": image_hidden_states,
+            }
+        )
+        return model_inputs
+
+    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
+        model_kwargs = super()._update_model_kwargs_for_generation(
+            outputs=outputs,
+            model_kwargs=model_kwargs,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+        # Keep this step's image_hidden_states so `prepare_inputs_for_generation` can pass them instead of pixel_values
+        model_kwargs["image_hidden_states"] = outputs.image_hidden_states
+        return model_kwargs
+
+    @staticmethod
+    # Copied from transformers.models.opt.modeling_opt.OPTForCausalLM._reorder_cache
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
new file mode 100644
index 00000000000000..1319d884e2d1e4
--- /dev/null
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -0,0 +1,351 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Idefics3.
+"""
+
+from typing import TYPE_CHECKING, List, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, load_image
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from ...utils import TensorType, logging
+
+
+if TYPE_CHECKING:
+    from ...tokenization_utils_base import PreTokenizedInput
+
+
+logger = logging.get_logger(__name__)
+
+
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+
+
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+
+
+def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token):
+    """Build the expanded prompt for an image split into an `image_rows` x `image_cols` grid of patches.
+
+    Every patch is rendered as `fake_token_around_image`, a 1-based `<row_R_col_C>` tag and `image_seq_len`
+    copies of `image_token`; each grid row is terminated by a newline. A `<global-img>` section, preceded by
+    an extra newline and closed by a final `fake_token_around_image`, is appended at the end.
+    """
+    expanded_image_tokens = f"{image_token}" * image_seq_len
+
+    grid_rows = [
+        "".join(
+            f"{fake_token_around_image}<row_{row}_col_{col}>{expanded_image_tokens}"
+            for col in range(1, image_cols + 1)
+        )
+        + "\n"
+        for row in range(1, image_rows + 1)
+    ]
+    global_section = f"\n{fake_token_around_image}<global-img>{expanded_image_tokens}{fake_token_around_image}"
+    return "".join(grid_rows) + global_section
+
+
+def _prompt_single_image(image_seq_len, fake_token_around_image, image_token):
+    """Build the prompt for an unsplit image: a `<global-img>` section wrapped in `fake_token_around_image`."""
+    return f"{fake_token_around_image}<global-img>" + f"{image_token}" * image_seq_len + f"{fake_token_around_image}"
+
+
+def get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token):
+    """Return the split-image prompt, or the single-image prompt when `image_rows` and `image_cols` are both 0."""
+    is_split = image_rows != 0 or image_cols != 0
+    if is_split:
+        return _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token)
+    return _prompt_single_image(image_seq_len, fake_token_around_image, image_token)
+
+
+class Idefics3Processor(ProcessorMixin):
+    r"""
+    Constructs an Idefics3 processor which wraps a tokenizer and an Idefics3 image processor into a single processor.
+
+    [`Idefics3Processor`] offers all the functionalities of [`Idefics3ImageProcessor`] and of the wrapped tokenizer. See
+    the docstring of [`~Idefics3Processor.__call__`] and [`~Idefics3Processor.decode`] for more information.
+
+    Args:
+        image_processor (`Idefics3ImageProcessor`):
+            An instance of [`Idefics3ImageProcessor`]. The image processor is a required input.
+        tokenizer (`PreTrainedTokenizerBase`, *optional*):
+            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Idefics3ImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer=None, chat_template: str = None, **kwargs):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
+        self.image_token = AddedToken("<image>", normalized=False, special=True)
+        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
+
+        tokens_to_add = {
+            "additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token]
+        }
+        tokenizer.add_special_tokens(tokens_to_add)
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def _extract_images_from_prompts(self, prompts):
+        """Return, for each prompt, the list of its image elements in order; URL strings are loaded as images."""
+
+        def _to_image(elem):
+            # Anything that is not a URL string reached this point by passing `is_valid_image`
+            return load_image(elem) if is_url(elem) else elem
+
+        return [
+            [_to_image(elem) for elem in prompt if is_valid_image(elem) or is_url(elem)]
+            for prompt in prompts
+        ]
+
+    def __call__(
+        self,
+        text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None,
+        images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None,
+        image_seq_len: int = 169,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        is_split_into_words: bool = False,
+        add_special_tokens: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ) -> BatchEncoding:
+        """
+        Expands the image tokens in `text`, tokenizes it and preprocesses `images` into a single [`BatchFeature`].
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> from transformers import Idefics3Processor
+        >>> from transformers.image_utils import load_image
+
+        >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/idefics3-8b")
+        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example
+
+        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+
+        >>> image1, image2 = load_image(url1), load_image(url2)
+        >>> images = [[image1], [image2]]
+
+        >>> text = [
+        ...     "<image>In this image, we see",
+        ...     "bla bla bla<image>",
+        ... ]
+        >>> outputs = processor(text=text, images=images, image_seq_len=2, return_tensors="pt", padding=True)
+        >>> input_ids = outputs.input_ids
+        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
+        >>> print(input_tokens)
+        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
+        ```
+
+        Args:
+            text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+
+                Wherever an image token, `<image>`, is encountered it is expanded to (for an image that is not split)
+                `<fake_token_around_image>` + `<global-img>` + `<image>` * `image_seq_len` + `<fake_token_around_image>`.
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If it is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
+            image_seq_len (`int`, *optional*, defaults to 169):
+                The number of `<image>` tokens that every image, and every patch of a split image, is expanded to.
+            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
+                Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
+            truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
+                Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding/truncation length. See
+                [`PreTrainedTokenizerFast.__call__`] for more information.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the
+                tokenization process and assume the input is already tokenized.
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information.
+            return_tensors (`Union[str, TensorType]`, *optional*):
+                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
+                information.
+        """
+        n_images_in_text = []
+        n_images_in_images = []
+        inputs = BatchFeature()
+
+        if images is not None:
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, list) and is_image_or_image_url(images[0]):
+                images = [images]
+            elif not (
+                isinstance(images, (list, tuple))
+                # every sample must itself be a (possibly empty) sequence of images
+                and all(isinstance(sample, (list, tuple)) for sample in images)
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+            n_images_in_images = [len(sample) for sample in images]
+
+            # Load images if they are URLs
+            images = [[load_image(im) for im in sample] for sample in images]
+            image_inputs = self.image_processor(images, return_tensors=return_tensors, return_row_col_info=True)
+            inputs.update(image_inputs)
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not isinstance(text, (list, tuple)) or not all(isinstance(sample, str) for sample in text):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            fake_image_token = self.fake_image_token.content
+            image_token = self.image_token.content
+
+            # Check the counts before expanding anything; a text-only call means zero images for every sample
+            n_images_in_text = [sample.count(image_token) for sample in text]
+            if (n_images_in_images or [0] * len(text)) != n_images_in_text:
+                raise ValueError(
+                    f"The number of images in the text {n_images_in_text} and images  {n_images_in_images} should be the same."
+                )
+            # Without row/col info every image is treated as unsplit (0 rows, 0 cols)
+            image_rows = inputs.pop("rows", [[0] * n_images for n_images in n_images_in_text])
+            image_cols = inputs.pop("cols", [[0] * n_images for n_images in n_images_in_text])
+
+            prompt_strings = []
+            for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
+                # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
+                image_prompt_strings = []
+                for n_rows, n_cols in zip(sample_rows, sample_cols):
+                    image_prompt_string = get_image_prompt_string(
+                        n_rows,
+                        n_cols,
+                        image_seq_len,
+                        image_token=image_token,
+                        fake_token_around_image=fake_image_token,
+                    )
+                    image_prompt_strings.append(image_prompt_string)
+
+                split_sample = sample.split(image_token)
+
+                # Place in the image prompt strings where the image tokens are
+                sample = split_sample[0]
+                for i, image_prompt_string in enumerate(image_prompt_strings):
+                    sample += image_prompt_string + split_sample[i + 1]
+                prompt_strings.append(sample)
+
+            text_inputs = self.tokenizer(
+                text=prompt_strings,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                is_split_into_words=is_split_into_words,
+                return_tensors=return_tensors,
+            )
+            inputs.update(text_inputs)
+
+        return inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the wrapped tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the wrapped tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        """Input names of the tokenizer followed by those of the image processor, without duplicates."""
+        combined_names = self.tokenizer.model_input_names + self.image_processor.model_input_names
+        return list(dict.fromkeys(combined_names))
+
+    @property
+    def default_chat_template(self):
+        """
+        This template formats inputs in the form of a chat history. For each message in the chat history:
+        * the template will output the role of the speaker followed by the content of the message.
+        * content can be a single string or a list of strings and images.
+        * If the content element is an image, the template will output a single <image> token
+        * The template will output an <end_of_utterance> token at the end of each message.
+
+        Example:
+
+        ```python
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "image"},
+                {"type": "text", "text": "What is in this image?"},
+                ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},]
+        }]
+        ```
+
+        Will create outputs like:
+        ```
+        <|begin_of_text|>User:<image><image>What is in this image?<end_of_utterance>
+        Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>
+        ```
+        """
+        # fmt: off
+        return (
+            "<|begin_of_text|>"
+            "{% for message in messages %}"
+                "{{message['role'].capitalize()}}"
+                "{% if message['content'][0]['type'] == 'image' %}"
+                    "{{':'}}"
+                "{% else %}"
+                    "{{': '}}"
+                "{% endif %}"
+                "{% for line in message['content'] %}"
+                    "{% if line['type'] == 'text' %}"
+                        "{{line['text']}}"
+                    "{% elif line['type'] == 'image' %}"
+                        "{{ '<image>' }}"
+                    "{% endif %}"
+                "{% endfor %}"
+                "<end_of_utterance>\n"
+            "{% endfor %}"
+
+            "{% if add_generation_prompt %}"
+                "{{ 'Assistant:' }}"
+            "{% endif %}"
+        )
+        # fmt: on
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 2db7b38b580375..52e3c33629fbdb 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4828,6 +4828,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class Idefics3ForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class Idefics2Model(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -4842,6 +4849,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class Idefics3PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class Idefics2Processor(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -4849,6 +4863,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class Idefics3Processor(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class ImageGPTForCausalImageModeling(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 436378582e54ca..53100be2f3a1c6 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -296,6 +296,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
 
 
+class Idefics3ImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
 class ImageGPTFeatureExtractor(metaclass=DummyObject):
     _backends = ["vision"]
 
diff --git a/tests/models/idefics3/__init__.py b/tests/models/idefics3/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py
new file mode 100644
index 00000000000000..ff4fa435ed51b2
--- /dev/null
+++ b/tests/models/idefics3/test_image_processing_idefics3.py
@@ -0,0 +1,311 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import Idefics2ImageProcessor
+
+
+if is_torch_available():
+    import torch
+
+
+class Idefics2ImageProcessingTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        num_images=1,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        do_rescale=True,
+        rescale_factor=1 / 255,
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+        do_convert_rgb=True,
+        do_pad=True,
+        do_image_splitting=True,
+    ):
+        size = size if size is not None else {"longest_edge": 4*364}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.num_images = num_images
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_convert_rgb = do_convert_rgb
+        self.do_pad = do_pad
+        self.do_image_splitting = do_image_splitting
+
+    def prepare_image_processor_dict(self):
+        return {
+            "do_convert_rgb": self.do_convert_rgb,
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_rescale": self.do_rescale,
+            "rescale_factor": self.rescale_factor,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_pad": self.do_pad,
+            "do_image_splitting": self.do_image_splitting,
+        }
+
+    def get_expected_values(self, image_inputs, batched=False):
+        """
+        Computes the expected (height, width) after resizing: the longer side is capped at `size["longest_edge"]`
+        and both sides are raised to `size["shortest_edge"]` when that key is present. Batched input returns maxima.
+        """
+        if not batched:
+            shortest_edge = self.size.get("shortest_edge", 0)  # the default `size` only defines "longest_edge"
+            longest_edge = self.size["longest_edge"]
+            image = image_inputs[0]
+            if isinstance(image, Image.Image):
+                w, h = image.size
+            elif isinstance(image, np.ndarray):
+                h, w = image.shape[0], image.shape[1]
+            else:
+                h, w = image.shape[1], image.shape[2]
+
+            aspect_ratio = w / h
+            if w > h and w >= longest_edge:
+                w = longest_edge
+                h = int(w / aspect_ratio)
+            elif h > w and h >= longest_edge:
+                h = longest_edge
+                w = int(h * aspect_ratio)
+            w = max(w, shortest_edge)
+            h = max(h, shortest_edge)
+            expected_height = h
+            expected_width = w
+        else:
+            expected_values = []
+            for images in image_inputs:
+                for image in images:
+                    expected_height, expected_width = self.get_expected_values([image])
+                    expected_values.append((expected_height, expected_width))
+            expected_height = max(expected_values, key=lambda item: item[0])[0]
+            expected_width = max(expected_values, key=lambda item: item[1])[1]
+
+        return expected_height, expected_width
+
+    def expected_output_image_shape(self, images):
+        height, width = self.get_expected_values(images, batched=True)
+        effective_nb_images = self.num_images * 5 if self.do_image_splitting else 1
+        return effective_nb_images, self.num_channels, height, width
+
+    def prepare_image_inputs(
+        self,
+        batch_size=None,
+        min_resolution=None,
+        max_resolution=None,
+        num_channels=None,
+        num_images=None,
+        size_divisor=None,
+        equal_resolution=False,
+        numpify=False,
+        torchify=False,
+    ):
+        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+        or a list of PyTorch tensors if one specifies torchify=True.
+
+        One can specify whether the images are of the same resolution or not.
+        """
+        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
+
+        batch_size = batch_size if batch_size is not None else self.batch_size
+        min_resolution = min_resolution if min_resolution is not None else self.min_resolution
+        max_resolution = max_resolution if max_resolution is not None else self.max_resolution
+        num_channels = num_channels if num_channels is not None else self.num_channels
+        num_images = num_images if num_images is not None else self.num_images
+
+        images_list = []
+        for i in range(batch_size):
+            images = []
+            for j in range(num_images):
+                if equal_resolution:
+                    width = height = max_resolution
+                else:
+                    # To avoid getting image width/height 0
+                    if size_divisor is not None:
+                        # If `size_divisor` is defined, the image needs to have width/height >= `size_divisor`
+                        min_resolution = max(size_divisor, min_resolution)
+                    width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
+                images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
+            images_list.append(images)
+
+        if not numpify and not torchify:
+            # PIL expects the channel dimension as last dimension
+            images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
+
+        if torchify:
+            images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
+
+        if numpify:
+            # Numpy images are typically in channels last format
+            images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
+
+        return images_list
+
+
+@require_torch
+@require_vision
+class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = Idefics2ImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+        self.assertTrue(hasattr(image_processing, "do_resize"))
+        self.assertTrue(hasattr(image_processing, "size"))
+        self.assertTrue(hasattr(image_processing, "do_rescale"))
+        self.assertTrue(hasattr(image_processing, "rescale_factor"))
+        self.assertTrue(hasattr(image_processing, "do_normalize"))
+        self.assertTrue(hasattr(image_processing, "image_mean"))
+        self.assertTrue(hasattr(image_processing, "image_std"))
+        self.assertTrue(hasattr(image_processing, "do_pad"))
+        self.assertTrue(hasattr(image_processing, "do_image_splitting"))
+
+    def test_call_numpy(self):
+        for image_processing_class in self.image_processor_list:
+            # Instantiate the class of the current iteration so every entry of `image_processor_list` is exercised
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random numpy tensors
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            for sample_images in image_inputs:
+                for image in sample_images:
+                    self.assertIsInstance(image, np.ndarray)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_numpy_4_channels(self):
+        for image_processing_class in self.image_processor_list:
+            # Build a 4-channel configuration and instantiate the class of the current iteration with it
+            image_processor_dict = self.image_processor_dict
+            image_processor_dict["image_mean"] = [0.5, 0.5, 0.5, 0.5]
+            image_processor_dict["image_std"] = [0.5, 0.5, 0.5, 0.5]
+            image_processing = image_processing_class(**image_processor_dict)
+            # create random numpy tensors
+            self.image_processor_tester.num_channels = 4
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+
+            for sample_images in image_inputs:
+                for image in sample_images:
+                    self.assertIsInstance(image, np.ndarray)
+
+            # Test not batched input
+            encoded_images = image_processing(
+                image_inputs[0], input_data_format="channels_last", return_tensors="pt"
+            ).pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(
+                image_inputs, input_data_format="channels_last", return_tensors="pt"
+            ).pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_pil(self):
+        for image_processing_class in self.image_processor_list:
+            # Instantiate the class of the current iteration so every entry of `image_processor_list` is exercised
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random PIL images
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
+            for images in image_inputs:
+                for image in images:
+                    self.assertIsInstance(image, Image.Image)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_pytorch(self):
+        for image_processing_class in self.image_processor_list:
+            # Instantiate the class of the current iteration so every entry of `image_processor_list` is exercised
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random PyTorch tensors
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+            for images in image_inputs:
+                for image in images:
+                    self.assertIsInstance(image, torch.Tensor)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            self.assertEqual(
+                tuple(encoded_images.shape),
+                (self.image_processor_tester.batch_size, *expected_output_image_shape),
+            )
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
new file mode 100644
index 00000000000000..74f7a24ca76a25
--- /dev/null
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -0,0 +1,532 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Idefics3 model."""
+
+import copy
+import gc
+import unittest
+from io import BytesIO
+
+import requests
+
+from transformers import (
+    AutoProcessor,
+    Idefics3Config,
+    Idefics3ForConditionalGeneration,
+    Idefics3Model,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+else:
+    is_torch_greater_or_equal_than_2_0 = False
+
+if is_vision_available():
+    from PIL import Image
+
+
+class Idefics3VisionText2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        is_training=True,
+        batch_size=2,
+        num_images=2,
+        seq_length=10,
+        vision_config={
+            "image_size": 12,
+            "patch_size": 12,
+            "num_channels": 3,
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 32,
+            "dropout": 0.1,
+            "attention_dropout": 0.1,
+            "initializer_range": 0.02,
+        },
+        perceiver_config={
+            "hidden_act": "silu",
+            "resampler_n_latents": 2,
+            "resampler_depth": 2,
+            "resampler_n_heads": 2,
+            "num_key_value_heads": 1,
+            "resampler_head_dim": 12,
+            "attention_dropout": 0.0,
+        },
+        text_config={
+            "vocab_size": 100,
+            "hidden_size": 64,
+            "intermediate_size": 56,
+            "num_hidden_layers": 3,
+            "num_attention_heads": 2,
+            "num_key_value_heads": 2,
+            "hidden_act": "silu",
+            "max_position_embeddings": 256,
+            "initializer_range": 0.02,
+            "rms_norm_eps": 1e-6,
+            "pad_token_id": 0,  # None in the original configuration_mistral, we set it to the unk_token_id
+            "bos_token_id": 1,
+            "eos_token_id": 2,
+            "image_token_id": 32_001,
+            "tie_word_embeddings": False,
+            "rope_theta": 10000.0,
+            "sliding_window": 32,
+            "attention_dropout": 0.0,
+        },
+        use_cache=False,
+        tie_word_embeddings=False,
+        image_token_id=99,
+    ):
+        self.parent = parent
+        self.is_training = is_training
+        self.batch_size = batch_size
+        self.num_images = num_images
+        self.num_channels = 3
+        self.seq_length = seq_length
+        self.use_cache = use_cache
+        self.image_token_id = image_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        # Hack - expose these text-config properties on the tester so the common tests can be used
+        self.vocab_size = text_config["vocab_size"]
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.hidden_size = text_config["hidden_size"]
+
+        self.vision_config = vision_config
+        self.perceiver_config = perceiver_config
+        self.text_config = text_config
+
+    def get_config(self):
+        return Idefics3Config(
+            use_cache=self.use_cache,
+            image_token_id=self.image_token_id,
+            tie_word_embeddings=self.tie_word_embeddings,
+            vision_config=self.vision_config,
+            perceiver_config=self.perceiver_config,
+            text_config=self.text_config,
+            vocab_size=self.vocab_size,
+        )
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.num_images,
+                self.vision_config["num_channels"],
+                self.vision_config["image_size"],
+                self.vision_config["image_size"],
+            ]
+        )
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
+
+        # For simplicity just set the last n tokens to the image token
+        n_image_tokens_per_batch = self.num_images * self.perceiver_config["resampler_n_latents"]
+        input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
+        attention_mask = input_ids.ne(1).to(torch_device)
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Model tester for `Idefics3`.
+    """
+
+    all_model_classes = (Idefics3Model,) if is_torch_available() else ()
+    fx_compatible = False
+    test_torchscript = False
+    test_pruning = False
+    test_resize_embeddings = True
+    test_head_masking = False
+
+    def setUp(self):
+        self.model_tester = Idefics3VisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False)
+
+    @unittest.skip(reason="input_embeds cannot be passed in without input_ids")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="input_embeds cannot be passed in without input_ids")
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @unittest.skip(reason="Model does not support padding right")
+    def test_flash_attn_2_generate_padding_right(self):
+        pass
+
+    @unittest.skip(reason="Model does not support padding right")
+    def test_flash_attn_2_inference_padding_right(self):
+        pass
+
+    # We need to override as we need to prepare such that the image token is the last token
+    def test_resize_tokens_embeddings(self):
+        (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            if self.model_tester.is_training is False:
+                model.eval()
+
+            model_vocab_size = config.text_config.vocab_size
+            # Retrieve the embeddings and clone them
+            model_embed = model.resize_token_embeddings(model_vocab_size)
+            cloned_embeddings = model_embed.weight.clone()
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+            # Ignore copy
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
+            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            model.image_token_id = model_vocab_size - 15 - 1
+            inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
+
+            # make sure that decoder_input_ids are resized as well
+            if "decoder_input_ids" in inputs_dict:
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            model_vocab_size = config.text_config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+
+            model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
+            self.assertEqual(model_embed.weight.shape[0] % 64, 0)
+            # NOTE(review): the `model.vocab_size` `assertTrue` below can never fail - confirm intent
+            self.assertEqual(model_embed.weight.shape[0], model.config.text_config.vocab_size)
+            self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
+
+            model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
+            self.assertEqual(model_embed.weight.shape[0] % 64, 0)
+
+            # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
+            target_dimension = 128
+            model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
+            self.assertEqual(model_embed.weight.shape[0], target_dimension)
+
+            with self.assertRaisesRegex(
+                ValueError,
+                "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
+            ):
+                model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
+
+    # We need to override as we need to prepare such that the image token is the last token
+    def test_resize_embeddings_untied(self):
+        (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        original_config.tie_word_embeddings = False
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config).to(torch_device)
+
+            # if no output embeddings -> leave test
+            if model.get_output_embeddings() is None:
+                continue
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_vocab_size = config.text_config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
+            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            model.image_token_id = model_vocab_size - 15 - 1
+            inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+
+@require_torch
+class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase):
+    """
+    Model tester for `Idefics3ForConditionalGeneration`.
+    """
+
+    all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
+    fx_compatible = False
+    test_pruning = False
+    test_resize_embeddings = True
+    test_head_masking = False
+    test_torchscript = False
+
+    def setUp(self):
+        self.model_tester = Idefics3VisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False)
+
+    @unittest.skip(reason="input_embeds cannot be passed in without input_ids")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="Model does not support padding right")
+    def test_flash_attn_2_generate_padding_right(self):
+        pass
+
+    @unittest.skip(reason="Model does not support padding right")
+    def test_flash_attn_2_inference_padding_right(self):
+        pass
+
+    # We need to override as we need to prepare such that the image token is the last token
+    def test_resize_tokens_embeddings(self):
+        (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            model_vocab_size = config.text_config.vocab_size
+            # Retrieve the embeddings and clone them
+            model_embed = model.resize_token_embeddings(model_vocab_size)
+            cloned_embeddings = model_embed.weight.clone()
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
+            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            model.model.image_token_id = model_vocab_size - 15 - 1
+            inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
+
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            model_vocab_size = config.text_config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+
+            model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
+            self.assertEqual(model_embed.weight.shape[0] % 64, 0)
+            # NOTE(review): the `model.vocab_size` `assertTrue` below can never fail - confirm intent
+            self.assertEqual(model_embed.weight.shape[0], model.config.text_config.vocab_size)
+            self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
+
+            model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
+            self.assertEqual(model_embed.weight.shape[0] % 64, 0)
+
+            # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
+            target_dimension = 128
+            model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
+            self.assertEqual(model_embed.weight.shape[0], target_dimension)
+
+            with self.assertRaisesRegex(
+                ValueError,
+                "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
+            ):
+                model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
+
+    # We need to override as we need to prepare such that the image token is the last token
+    def test_resize_embeddings_untied(self):
+        (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
+
+        original_config.tie_word_embeddings = False
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config).to(torch_device)
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_vocab_size = config.text_config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
+            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            model.model.image_token_id = model_vocab_size - 15 - 1
+            inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+
+@require_torch
+class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics3-8b-base")
+        self.image1 = Image.open(
+            BytesIO(
+                requests.get(
+                    "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+                ).content
+            )
+        )
+        self.image2 = Image.open(
+            BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
+        )
+        self.image3 = Image.open(
+            BytesIO(
+                requests.get(
+                    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
+                ).content
+            )
+        )
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @slow
+    def test_integration_test(self):
+        model = Idefics3ForConditionalGeneration.from_pretrained(
+            "HuggingFaceM4/idefics3-8b-base",
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+        model.to(torch_device)
+
+        # Create inputs
+        text = "<image>In this image, we see"
+        images = self.image1
+        inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True)
+        inputs.to(torch_device)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=10)
+        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        # Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and']
+        expected_generated_text = "In this image, we see the Statue of Liberty, the New York City"
+        self.assertEqual(generated_texts[0], expected_generated_text)
+
+    @slow
+    @require_bitsandbytes
+    def test_integration_test_4bit(self):
+        # Let's make sure we test the preprocessing to replace what is used
+        model = Idefics3ForConditionalGeneration.from_pretrained(
+            "HuggingFaceM4/idefics3-8b-base", load_in_4bit=True, device_map="auto"
+        )
+
+        # Create pixel inputs
+        text = ["<image>In this image, we see", "bla, bla <image><image>"]
+        images = [[self.image1], [self.image2, self.image3]]
+        inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
+
+        generated_ids = model.generate(**inputs, max_new_tokens=10)
+        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
+        self.assertEqual(generated_texts[0], expected_generated_text)
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
new file mode 100644
index 00000000000000..cee3e381872bee
--- /dev/null
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -0,0 +1,235 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from io import BytesIO
+
+import requests
+
+from transformers import Idefics3Processor
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_vision_available
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+@require_torch
+@require_vision
+class Idefics3ProcessorTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = Idefics3Processor.from_pretrained("HuggingFaceM4/idefics3-8b", image_seq_len=2)
+        self.image1 = Image.open(
+            BytesIO(
+                requests.get(
+                    "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+                ).content
+            )
+        )
+        self.image2 = Image.open(
+            BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
+        )
+        self.image3 = Image.open(
+            BytesIO(
+                requests.get(
+                    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
+                ).content
+            )
+        )
+        self.bos_token = self.processor.tokenizer.bos_token
+        self.image_token = self.processor.image_token.content
+        self.fake_image_token = self.processor.fake_image_token.content
+
+        self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token)
+        self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token)
+        self.fake_image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
+        self.image_seq_len = self.processor.image_seq_len
+
+    def test_process_interleaved_images_prompts_no_image_splitting(self):
+        old_image_splitting = self.processor.image_processor.do_image_splitting
+
+        self.processor.image_processor.do_image_splitting = False
+
+        # Test that a single image is processed correctly
+        inputs = self.processor(images=self.image1)
+        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
+        # fmt: on
+
+        # Test a single sample with image and text
+        image_str = "<image>"
+        text_str = "In this image, we see"
+        text = image_str + text_str
+        inputs = self.processor(text=text, images=self.image1)
+
+        # fmt: off
+        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
+        # fmt: on
+
+        # Test that batch is correctly processed
+        image_str = "<image>"
+        text_str_1 = "In this image, we see"
+        text_str_2 = "bla, bla"
+
+        text = [
+            image_str + text_str_1,
+            text_str_2 + image_str + image_str,
+        ]
+        images = [[self.image1], [self.image2, self.image3]]
+
+        inputs = self.processor(text=text, images=images, padding=True)
+
+        # fmt: off
+        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+        expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
+        expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
+        # Pad the first input to match the second input
+        pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
+        padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
+
+        self.assertEqual(
+            inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
+        )
+        self.assertEqual(
+            inputs["attention_mask"],
+            [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
+        )
+        self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 767, 980))
+        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980))
+        # fmt: on
+
+        self.processor.image_processor.do_image_splitting = old_image_splitting
+
+    def test_process_interleaved_images_prompts_image_splitting(self):
+        old_image_splitting = self.processor.image_processor.do_image_splitting
+
+        self.processor.image_processor.do_image_splitting = True
+
+        # Test that a single image is processed correctly
+        inputs = self.processor(images=self.image1)
+        self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
+        # fmt: on
+
+        # Test a single sample with image and text
+        image_str = "<image>"
+        text_str = "In this image, we see"
+        text = image_str + text_str
+        inputs = self.processor(text=text, images=self.image1)
+
+        # fmt: off
+        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+        self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
+        # fmt: on
+
+        # Test that batch is correctly processed
+        image_str = "<image>"
+        text_str_1 = "In this image, we see"
+        text_str_2 = "bla, bla"
+
+        text = [
+            image_str + text_str_1,
+            text_str_2 + image_str + image_str,
+        ]
+        images = [[self.image1], [self.image2, self.image3]]
+
+        inputs = self.processor(text=text, images=images, padding=True)
+
+        # fmt: off
+        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+        expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
+        expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id]
+        # Pad the first input to match the second input
+        pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
+        padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
+
+        self.assertEqual(
+            inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
+        )
+        self.assertEqual(
+            inputs["attention_mask"],
+            [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
+        )
+        self.assertEqual(inputs['pixel_values'].shape, (2, 10, 3, 767, 980))
+        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980))
+        # fmt: on
+
+        self.processor.image_processor.do_image_splitting = old_image_splitting
+
+    def test_add_special_tokens_processor(self):
+        image_str = "<image>"
+        text_str = "In this image, we see"
+        text = text_str + image_str
+
+        n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1
+
+        # fmt: off
+        inputs = self.processor(text=text, images=self.image1, add_special_tokens=False)
+        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+
+        inputs = self.processor(text=text, images=self.image1)
+        expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+        # fmt: on
+
+    def test_apply_chat_template(self):
+        # Message content is a mix of typed entries (text, image) and a plain string
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What do these images show?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                    "What do these images show?",
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.",
+                    }
+                ],
+            },
+            {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
+        ]
+
+        processor = self.processor
+        # Render the messages to a single prompt string that ends with the generation prompt
+        rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+        expected_rendered = (
+            "User: What do these images show?<image><image><end_of_utterance>\n"
+            "Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<end_of_utterance>\n"
+            "User: And who is that?<end_of_utterance>\n"
+            "Assistant:"
+        )
+        self.assertEqual(rendered, expected_rendered)

From afce007ee199f2db84cf447da648cc83bcc5f4de Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Wed, 7 Aug 2024 20:26:23 +0000
Subject: [PATCH 02/50] fixes to make both pipelines identical

---
 .../idefics3/image_processing_idefics3.py     | 108 +++++++++++-------
 .../models/idefics3/processing_idefics3.py    |  13 ++-
 2 files changed, 80 insertions(+), 41 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 2dedac9c9acad8..ccb0b6602ef069 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -19,7 +19,7 @@
 import numpy as np
 
 from ...image_processing_utils import BaseImageProcessor, BatchFeature
-from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format
+from ...image_transforms import PaddingMode, pad, rescale, resize, to_channel_dimension_format
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
@@ -74,9 +74,13 @@ def _resize_output_size_rescale_to_max_len(
     if width >= height:
         width = max_len
         height = int(width / aspect_ratio)
+        if height % 2 != 0:
+            height += 1
     elif height > width:
         height = max_len
         width = int(height * aspect_ratio)
+        if width % 2 != 0:
+            width += 1
 
     # Avoid resizing to a size smaller than min_len
     height = max(height, min_len)
@@ -148,7 +152,10 @@ def get_resize_output_image_size(
     if resolution_max_side > max_image_size:
         raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
 
-    height, width = get_image_size(image, channel_dim=input_data_format)
+    if isinstance(image, Image.Image):
+        width, height = image.size
+    else:
+        height, width = get_image_size(image, channel_dim=input_data_format)
 
     # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
     height, width = _resize_output_size_rescale_to_max_len(
@@ -176,7 +183,10 @@ def split_image(
     `width`).
     3) Returns the list of the crops and the original image, in addition to the number of splits for the height and the width.
     """
-    height, width = get_image_size(image, channel_dim=input_data_format)
+    if isinstance(image, Image.Image):
+        width, height = image.size
+    else:
+        height, width = get_image_size(image, channel_dim=input_data_format)
     max_height = max_width = max_image_size["longest_edge"]
 
     frames = []
@@ -200,24 +210,30 @@ def split_image(
                 end_y = min(start_y + optimal_height, height)
 
                 # Crop the image
-                cropped_image = _crop(
-                    image, start_x, start_y, end_x, end_y, input_data_format=input_data_format, data_format=data_format
-                )
+                if isinstance(image, Image.Image):
+                    cropped_image = image.crop((start_x, start_y, end_x, end_y))
+                else:
+                    cropped_image = _crop(
+                        image, start_x, start_y, end_x, end_y, input_data_format=input_data_format, data_format=data_format
+                    )
                 frames.append(cropped_image)
 
         # For the global image at the end, we resize it to match the max_image_size, for cpu memory efficiency
         global_image_height, global_image_width = max_height, max_width
         if height != global_image_height or width != global_image_width:
-            image = resize(
-                image,
-                (global_image_height, global_image_width),
-                resample=resample,
-                input_data_format=input_data_format,
-            )
+            if isinstance(image, Image.Image):
+                image = image.resize((global_image_width, global_image_height), resample=resample)
+            else:
+                image = resize(
+                    image,
+                    (global_image_height, global_image_width),
+                    resample=resample,
+                    input_data_format=input_data_format,
+                )
     else:
         num_splits_h, num_splits_w = 0, 0
 
-    if data_format is not None:
+    if data_format is not None and not isinstance(image, Image.Image):
         image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
 
     frames.append(image)
@@ -419,7 +435,7 @@ def __init__(
         super().__init__(**kwargs)
         self.do_convert_rgb = do_convert_rgb
         self.do_resize = do_resize
-        self.size = size if size is not None else {"longest_edge": 5*364}
+        self.size = size if size is not None else {"longest_edge": 4*364}
         self.resample = resample
         self.do_image_splitting = do_image_splitting
         self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 364}
@@ -464,6 +480,8 @@ def resize(
             size = (size["height"], size["width"])
         else:
             raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
+        if isinstance(image, Image.Image):
+            return image.resize((size[1], size[0]), resample=resample)
         return resize(
             image, size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
         )
@@ -701,21 +719,8 @@ def preprocess(
             resample=resample,
         )
 
-        if do_convert_rgb:
-            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
-
-        # All transformations expect numpy arrays.
-        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
-
-        if is_scaled_image(images_list[0][0]) and do_rescale:
-            logger.warning_once(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images_list[0][0])
+        # NOTE: resizing and splitting below operate directly on the input images (PIL or array);
+        # conversion to numpy arrays is deferred until just before rescaling.
 
         if do_resize:
             images_list = [
@@ -733,10 +738,21 @@ def preprocess(
         for images in images_list:
             new_images = []
             for img in images:
-                height, width, _ = img.shape
-                new_size = {"height": math.ceil(height / self.vision_encoder_max_size) * self.vision_encoder_max_size,
-                            "width": math.ceil(width/ self.vision_encoder_max_size) * self.vision_encoder_max_size}
-                new_images.append(self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format))
+                if isinstance(img, Image.Image):
+                    width, height = img.size
+                else:
+                    height, width, _ = img.shape
+                aspect_ratio = width / height
+                if width >= height:
+                    width = math.ceil(width / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                    height = int(width / aspect_ratio)
+                    height = math.ceil(height / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                elif height > width:
+                    height = math.ceil(height / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                    width = int(height * aspect_ratio)
+                    width = math.ceil(width / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                new_size = {"height": height, "width": width}
+                new_images.append(self.resize(img, size=new_size, resample=resample))
             new_images_list.append(new_images)
         images_list = new_images_list
         del new_images_list
@@ -766,14 +782,26 @@ def preprocess(
             images_list_rows = [[0] * len(images) for images in images_list]
             images_list_cols = [[0] * len(images) for images in images_list]
 
+        if do_convert_rgb:
+            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
+
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+
+        if is_scaled_image(images_list[0][0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
         if do_rescale:
-            images_list = [
-                [
-                    self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
-                    for image in images
-                ]
-                for images in images_list
-            ]
+            rescaled_images_array = []
+            for image in images_list:
+                rescaled_images_array.append([rescale(img, rescale_factor) for img in image])
+            images_list = rescaled_images_array
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images_list[0][0])
 
         if do_normalize:
             images_list = [
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 1319d884e2d1e4..d948d167fe549b 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -214,7 +214,18 @@ def __call__(
             n_images_in_images = [len(sample) for sample in images]
 
             # Load images if they are URLs
-            images = [[load_image(im) for im in sample] for sample in images]
+            new_images = []
+            for sample in images:
+                new_images.append([])
+                for im in sample:
+                    if is_valid_image(im):
+                        new_images[-1].append(im) # already loaded
+                    elif isinstance(im, str):
+                        new_images[-1].append(load_image(im))
+
+            images = new_images
+            del new_images
+
             image_inputs = self.image_processor(images, return_tensors=return_tensors, return_row_col_info=True)
             inputs.update(image_inputs)
 

From 3e3b31da0e6a92d8599358849e7fb55ed024726c Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Thu, 8 Aug 2024 09:04:47 +0000
Subject: [PATCH 03/50] fix for quantized models

---
 src/transformers/models/idefics3/modeling_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index e3f554c8594e4f..abd7d79f5ab9c7 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -885,7 +885,7 @@ def inputs_merger(
         num_images, _, vision_hidden_size = image_hidden_states.shape
         special_image_token_mask = input_ids == self.image_token_id
         new_inputs_embeds = inputs_embeds.clone()
-        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
+        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size).to(inputs_embeds.dtype) # cast to the dtype of the input_embeds to support quantized models
         new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
         return new_inputs_embeds
 

From 9c8ffc4d1815194cd677c32c787569d23f41213e Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Thu, 8 Aug 2024 10:05:37 +0000
Subject: [PATCH 04/50] First pass at the review

---
 .../idefics3/image_processing_idefics3.py     |  6 ++
 .../models/idefics3/processing_idefics3.py    | 58 -------------------
 2 files changed, 6 insertions(+), 58 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index ccb0b6602ef069..3b89ab2eafcb1e 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -700,6 +700,12 @@ def preprocess(
         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
         do_pad = do_pad if do_pad is not None else self.do_pad
 
+        if not do_image_splitting:
+            logger.warning_once(
+                "Idefics3 was trained on splitted image to support high resolution. Setting do_image_splitting=False will degrade the performance."
+            )
+
+
         images_list = make_list_of_images(images)
 
         if not valid_images(images_list[0]):
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index d948d167fe549b..dca8847350e978 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -302,61 +302,3 @@ def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
-
-    @property
-    def default_chat_template(self):
-        """
-        This template formats inputs in the form of a chat history. For each message in the chat history:
-        * the template will output the role of the speaker followed by the content of the message.
-        * content can be a single string or a list of strings and images.
-        * If the content element is an image, the template will output a sequence of <image> tokens
-        * The template will output an <end_of_utterance> token at the end of each message.
-
-        Example:
-
-        ```python
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "image"},
-                {"type": "image"},
-                {"type": "text", "text": "What is in this image?"},
-                ],
-        },
-        {
-            "role": "assistant",
-            "content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},]
-        }]
-        ```
-
-        Will create outputs like:
-        ```
-        <|begin_of_text|>User:<image><image> What is in this Image?<end_of_utterance>
-        Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>
-        ```
-        """
-        # fmt: off
-        return (
-            "<|begin_of_text|>"
-            "{% for message in messages %}"
-                "{{message['role'].capitalize()}}"
-                "{% if message['content'][0]['type'] == 'image' %}"
-                    "{{':'}}"
-                "{% else %}"
-                    "{{': '}}"
-                "{% endif %}"
-                "{% for line in message['content'] %}"
-                    "{% if line['type'] == 'text' %}"
-                        "{{line['text']}}"
-                    "{% elif line['type'] == 'image' %}"
-                        "{{ '<image>' }}"
-                    "{% endif %}"
-                "{% endfor %}"
-                "<end_of_utterance>\n"
-            "{% endfor %}"
-
-            "{% if add_generation_prompt %}"
-                "{{ 'Assistant:' }}"
-            "{% endif %}"
-        )
-        # fmt: on

From 7e3d7a63f51197e47c9c3b067a29edf05c012b57 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Thu, 8 Aug 2024 13:31:42 +0000
Subject: [PATCH 05/50] remove vocab size from the main config (it's still in
 the text_config)

---
 .../models/idefics3/configuration_idefics3.py         | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 8dd80791f4a309..8ea1f490cb5110 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -150,8 +150,6 @@ class Idefics3Config(PretrainedConfig):
             Custom text config or dict for the text model
         scale_factor (`int`, *optional*, defaults to 2):
             The scale factor for the image encoder.
-        vocab_size (`int`, *optional*, defaults to 100000):
-            The size of the vocabulary.
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimensionality of the encoder layers and the pooler layer.
         pad_token_id (`int`, *optional*, defaults to 128002):
@@ -181,7 +179,6 @@ def __init__(
         vision_config=None,
         text_config=None,
         scale_factor=2,
-        vocab_size=100000,
         hidden_size=4096,
         pad_token_id=128_002,
         max_position_embeddings=131_072,
@@ -190,7 +187,6 @@ def __init__(
         self.image_token_id = image_token_id
         self.use_cache = use_cache
         self.tie_word_embeddings = tie_word_embeddings
-        self.vocab_size = vocab_size
 
         if vision_config is None:
             self.vision_config = Idefics3VisionConfig()
@@ -201,20 +197,17 @@ def __init__(
             self.vision_config = vision_config
 
         if isinstance(text_config, dict):
-            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama3"
-            text_config["vocab_size"] = vocab_size
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
             logger.info("text_config is None, using default text config")
-            text_config = CONFIG_MAPPING["llama3"](
-                vocab_size=vocab_size,
+            text_config = CONFIG_MAPPING["llama"](
                 max_position_embeddings=max_position_embeddings,
                 rms_norm_eps=1e-5,
                 pad_token_id=pad_token_id,
                 tie_word_embeddings=False,
             )
 
-        text_config.vocab_size = vocab_size
         self.text_config = text_config
         self.scale_factor = scale_factor
         self.hidden_size = hidden_size

From dd99bca05c4fa6cedf9b09c5b008c7fe3f24d5a3 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Thu, 8 Aug 2024 14:20:46 +0000
Subject: [PATCH 06/50] hot fix for merve

---
 src/transformers/models/idefics3/configuration_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 8ea1f490cb5110..f9e838c30fdfcb 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -197,7 +197,7 @@ def __init__(
             self.vision_config = vision_config
 
         if isinstance(text_config, dict):
-            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            text_config["model_type"] = "llama"
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
             logger.info("text_config is None, using default text config")

From ddac9ec93a8660e83305ec0bf37b30e317774824 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Fri, 9 Aug 2024 11:40:03 +0200
Subject: [PATCH 07/50] Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/models/idefics3/modeling_idefics3.py  |  5 +++--
 .../models/idefics3/processing_idefics3.py             | 10 +++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index abd7d79f5ab9c7..75145f4c41b7f1 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -383,6 +383,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.fc2(hidden_states)
         return hidden_states
 
+
 class Idefics3SimpleMLP(nn.Module):
     def __init__(self, input_size, output_size):
         super().__init__()
@@ -393,6 +394,7 @@ def __init__(self, input_size, output_size):
     def forward(self, x):
         return self.proj(x)
 
+
 class Idefics3EncoderLayer(nn.Module):
     def __init__(self, config: Idefics3Config):
         super().__init__()
@@ -688,7 +690,7 @@ class Idefics3PreTrainedModel(PreTrainedModel):
     config_class = Idefics3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Idefics3VisionAttention", "Idefics3SimpleMLP", "Idefics3DecoderLayer"]
+    _no_split_modules = ["Idefics3VisionAttention", "Idefics3DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_cache_class = True
@@ -1074,7 +1076,6 @@ def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
     def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
-        # model_embeds = self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of)
         model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
         if new_num_tokens is None and pad_to_multiple_of is None:
             return model_embeds
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index dca8847350e978..6aa5e1a410954b 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -42,23 +42,23 @@ def is_image_or_image_url(elem):
 
 def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token):
     """Prompt with expanded image tokens for when the image is split into patches."""
-    text_splitted_images = ""
+    text_split_images = ""
     for n_h in range(image_rows):
         for n_w in range(image_cols):
-            text_splitted_images += (
+            text_split_images += (
                 f"{fake_token_around_image}"
                 + f"<row_{n_h + 1}_col_{n_w + 1}>"
                 + f"{image_token}" * image_seq_len
             )
-        text_splitted_images += "\n"
+        text_split_images += "\n"
 
-    text_splitted_images += (
+    text_split_images += (
         f"\n{fake_token_around_image}"
         + "<global-img>"
         + f"{image_token}" * image_seq_len
         + f"{fake_token_around_image}"
     )
-    return text_splitted_images
+    return text_split_images
 
 
 def _prompt_single_image(image_seq_len, fake_token_around_image, image_token):

From 188bb761e524cbe29fa656bdc357e2b593b8c8db Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 9 Aug 2024 09:18:33 +0000
Subject: [PATCH 08/50] re-add model_type for text_config

---
 src/transformers/models/idefics3/configuration_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index f9e838c30fdfcb..8ea1f490cb5110 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -197,7 +197,7 @@ def __init__(
             self.vision_config = vision_config
 
         if isinstance(text_config, dict):
-            text_config["model_type"] = "llama"
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
             logger.info("text_config is None, using default text config")

From 43fb214994141e3144e158e7b6bc75f021c39918 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 9 Aug 2024 09:46:39 +0000
Subject: [PATCH 09/50] remove support for old_cache

---
 .../idefics3/image_processing_idefics3.py     |  2 +-
 .../models/idefics3/modeling_idefics3.py      | 39 -------------------
 2 files changed, 1 insertion(+), 40 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 3b89ab2eafcb1e..114714b5553992 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -739,7 +739,7 @@ def preprocess(
 
         # We will resize both height and width of each image to the nearest 364 multiple, disregarding the aspect ratio
         # for size=(10, 364) -> rescaled_size=(364, 364)
-        # for size=(11, 365) -> rescaled_size=(364, 364*2) 
+        # for size=(11, 365) -> rescaled_size=(364, 364*2)
         new_images_list = []
         for images in images_list:
             new_images = []
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 75145f4c41b7f1..bee15e69b4d909 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -862,13 +862,6 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.text_model.set_input_embeddings(value)
 
-    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
-        model_embeds = self.text_model.resize_token_embeddings(
-            new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of
-        )
-        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        return model_embeds
-
     def inputs_merger(
         self,
         input_ids: torch.LongTensor,
@@ -940,11 +933,7 @@ def forward(
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         past_seen_tokens = 0
-        return_legacy_cache = False
         if use_cache:
-            if not isinstance(past_key_values, Cache):  # kept for BC (non `Cache` `past_key_values` inputs)
-                return_legacy_cache = True
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             past_seen_tokens = past_key_values.get_seq_length()
 
         if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
@@ -1016,9 +1005,6 @@ def forward(
             return_dict=return_dict,
         )
 
-        if return_legacy_cache and use_cache:
-            outputs.past_key_values = outputs.past_key_values.to_legacy_cache()
-
         if not return_dict:
             return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
 
@@ -1075,21 +1061,6 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
-    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
-        model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
-        if new_num_tokens is None and pad_to_multiple_of is None:
-            return model_embeds
-
-        # Update base model and current model config
-        # Ignore copy
-        self.config.text_config.vocab_size = model_embeds.weight.shape[0]
-        self.vocab_size = self.config.text_config.vocab_size
-
-        # Tie weights again if needed
-        self.tie_weights()
-
-        return model_embeds
-
     def tie_weights(self):
         """
         Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
@@ -1292,13 +1263,3 @@ def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_
         # Get the precomputed image_hidden_states
         model_kwargs["image_hidden_states"] = outputs.image_hidden_states
         return model_kwargs
-
-    @staticmethod
-    # Copied from transformers.models.opt.modeling_opt.OPTForCausalLM._reorder_cache
-    def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (
-                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
-            )
-        return reordered_past

From c9e0d85765d2e435e543e4b92a01bf5b7fb0c1ab Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 9 Aug 2024 11:14:18 +0000
Subject: [PATCH 10/50] remove hidden_size from main config

---
 .../models/idefics3/configuration_idefics3.py             | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 8ea1f490cb5110..48d271ba8a247e 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -30,7 +30,7 @@ class Idefics3VisionConfig(PretrainedConfig):
     Idefics3 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
     [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics3 model
-    [HuggingFaceM4/idefics3-8b](https://huggingface.co/HuggingFaceM4/idefics3-8b).
+    [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3).
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -132,7 +132,7 @@ class Idefics3Config(PretrainedConfig):
     This is the configuration class to store the configuration of a [`Idefics3Model`]. It is used to instantiate a
     Idefics3 model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the model of the Idefics3
-    [HuggingFaceM4/idefics3-8b](https://huggingface.co/HuggingFaceM4/idefics3-8b) architecture.
+    [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -150,8 +150,6 @@ class Idefics3Config(PretrainedConfig):
             Custom text config or dict for the text model
         scale_factor (`int`, *optional*, defaults to 2):
             The scale factor for the image encoder.
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimensionality of the encoder layers and the pooler layer.
         pad_token_id (`int`, *optional*, defaults to 128002):
             The id of the padding token.
         max_position_embeddings (`int`, *optional*, defaults to 131072):
@@ -179,7 +177,6 @@ def __init__(
         vision_config=None,
         text_config=None,
         scale_factor=2,
-        hidden_size=4096,
         pad_token_id=128_002,
         max_position_embeddings=131_072,
         **kwargs,
@@ -210,6 +207,5 @@ def __init__(
 
         self.text_config = text_config
         self.scale_factor = scale_factor
-        self.hidden_size = hidden_size
 
         super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)

From 1b2b89c465bf76c849e509ed36a93a0d6155f3de Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 9 Aug 2024 11:14:30 +0000
Subject: [PATCH 11/50] rename idefics3 HF repo

---
 .../models/idefics3/convert_idefics3_weights_to_hf.py       | 2 +-
 src/transformers/models/idefics3/modeling_idefics3.py       | 4 ++--
 src/transformers/models/idefics3/processing_idefics3.py     | 2 +-
 tests/models/idefics3/test_modeling_idefics3.py             | 6 +++---
 tests/models/idefics3/test_processing_idefics3.py           | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py
index fc52edd943ee03..204104a58b30e8 100644
--- a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py
+++ b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py
@@ -31,7 +31,7 @@
 
 
 EPILOG_TXT = """Example:
-    python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/idefics3-8b --output_hub_path org/idefics3
+    python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/Idefics3-8B-Llama3 --output_hub_path org/idefics3
 """
 
 
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index bee15e69b4d909..d17cdb3399a366 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -1114,8 +1114,8 @@ def forward(
         >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
         >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
 
-        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics3-8b")
-        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics3-8b", device_map="auto")
+        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
+        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", device_map="auto")
 
         >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
         >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 6aa5e1a410954b..8c01eb1fae857e 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -145,7 +145,7 @@ def __call__(
         >>> from transformers import Idefics3Processor
         >>> from transformers.image_utils import load_image
 
-        >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/idefics3-8b", image_seq_len=2)
+        >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
         >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example
 
         >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index 74f7a24ca76a25..c032514f3836e9 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -467,7 +467,7 @@ def test_resize_embeddings_untied(self):
 @require_torch
 class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics3-8b-base")
+        self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3-base")
         self.image1 = Image.open(
             BytesIO(
                 requests.get(
@@ -493,7 +493,7 @@ def tearDown(self):
     @slow
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
-            "HuggingFaceM4/idefics3-8b-base",
+            "HuggingFaceM4/Idefics3-8B-Llama3-base",
             torch_dtype=torch.bfloat16,
             device_map="auto",
         )
@@ -517,7 +517,7 @@ def test_integration_test(self):
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(
-            "HuggingFaceM4/idefics3-8b-base", load_in_4bit=True, device_map="auto"
+            "HuggingFaceM4/Idefics3-8B-Llama3-base", load_in_4bit=True, device_map="auto"
         )
 
         # Create pixel inputs
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index cee3e381872bee..7391f076a75486 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -31,7 +31,7 @@
 @require_vision
 class Idefics3ProcessorTest(unittest.TestCase):
     def setUp(self):
-        self.processor = Idefics3Processor.from_pretrained("HuggingFaceM4/idefics3-8b", image_seq_len=2)
+        self.processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
         self.image1 = Image.open(
             BytesIO(
                 requests.get(

From 6ff766f5d218f8a1435cc381b515a4471c9a2074 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 12 Aug 2024 09:29:09 +0000
Subject: [PATCH 12/50] few changes suggested in the PR

---
 .../models/idefics3/configuration_idefics3.py |  3 ++-
 .../idefics3/image_processing_idefics3.py     | 27 +++++++++++--------
 .../models/idefics3/modeling_idefics3.py      |  1 +
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 48d271ba8a247e..4dcaded43e5839 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -139,7 +139,8 @@ class Idefics3Config(PretrainedConfig):
 
     Args:
         use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should cache the key/value pairs of the attention mechanism.
+            Whether or not the model should cache the key/value pairs of the attention mechanism. Only
+            relevant if `config.is_decoder=True`.
         image_token_id (`int`, *optional*, defaults to 128257):
             The id of the "image" token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 114714b5553992..3e9c7ca0cadaa5 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -28,6 +28,7 @@
     PILImageResampling,
     get_image_size,
     infer_channel_dimension_format,
+    is_pil_image,
     is_scaled_image,
     is_valid_image,
     to_numpy_array,
@@ -60,10 +61,6 @@ def _resize_output_size_rescale_to_max_len(
             Minimum size of the output image.
         max_len (`int`, *optional*, defaults to the maximum size of the image):
             Maximum size of the output image.
-        size (`Dict[str, int]`):
-            Size of the output image containing the keys "shortest_edge" and "longest_edge".
-        input_data_format (`ChannelDimension` or `str`):
-            The channel dimension format of the input image.
 
     Returns:
         The output size of the image after resizing.
@@ -101,10 +98,6 @@ def _resize_output_size_scale_below_upper_bound(
             Width of the input image.
         max_len (`Dict[str, int]`, *optional*, defaults to the maximum size of the image):
             Defines the maximum dimensions of the image.
-        size (`Dict[str, int]`):
-            Size of the output image containing the keys "shortest_edge" and "longest_edge".
-        input_data_format (`ChannelDimension` or `str`):
-            The channel dimension format of the input image.
 
     Returns:
         The output size of the image after resizing.
@@ -494,8 +487,8 @@ def split_image(
         data_format: Optional[Union[str, ChannelDimension]] = None,
     ):
         """
-        Split an image into 4 equal sub-images, and the concatenate that sequence with the original image.
-        That means that a single image becomes a sequence of 5 images.
+        Split an image into squares of side max_image_size and the original image resized to max_image_size.
+        That means that a single image becomes a sequence of images.
         This is a "trick" to spend more compute on each image with no changes in the vision encoder.
 
         Args:
@@ -536,6 +529,7 @@ def _pad_image(
         )
         return padded_image
 
+    # Copied from transformers.models.idefics2.image_processing_idefics2.pad
     def pad(
         self,
         images: List[np.ndarray],
@@ -714,6 +708,17 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
+        for img in images_list[0]:
+            if not is_pil_image(img):
+                logger.warning_once(
+                    "Idefics3's image processing pipeline is optimized to process PIL images, but you passed a different type of image. "
+                    "This might lead to inconsistent results"
+                )
+            if input_data_format is None:
+                # We assume that all images have the same channel dimension format.
+                input_data_format = infer_channel_dimension_format(images_list[0][0])
+
+
         validate_preprocess_arguments(
             do_rescale=do_rescale,
             rescale_factor=rescale_factor,
@@ -758,7 +763,7 @@ def preprocess(
                     width = int(height * aspect_ratio)
                     width = math.ceil(width / self.vision_encoder_max_size) * self.vision_encoder_max_size
                 new_size = {"height": height, "width": width}
-                new_images.append(self.resize(img, size=new_size, resample=resample))
+                new_images.append(self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format))
             new_images_list.append(new_images)
         images_list = new_images_list
         del new_images_list
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index d17cdb3399a366..aa918692dbda11 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -126,6 +126,7 @@ class Idefics3CausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
 
 
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionEmbeddings
 class Idefics3VisionEmbeddings(nn.Module):
     """
     This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable

From 11c2e1ad727207d129c08a5983e66b2b69ffeeef Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 12 Aug 2024 09:55:23 +0000
Subject: [PATCH 13/50] fix to input_data_format computation

---
 .../models/idefics3/image_processing_idefics3.py            | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 3e9c7ca0cadaa5..7806d22947359a 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -714,9 +714,9 @@ def preprocess(
                     "Idefics3's image processing pipeline is optimized to process PIL images, but you passed a different type of image. "
                     "This might lead to inconsistent results"
                 )
-            if input_data_format is None:
-                # We assume that all images have the same channel dimension format.
-                input_data_format = infer_channel_dimension_format(images_list[0][0])
+                if input_data_format is None:
+                    # We assume that all images have the same channel dimension format.
+                    input_data_format = infer_channel_dimension_format(images_list[0][0])
 
 
         validate_preprocess_arguments(

From c1048ed198ead8d22536c272f737dfdac8f06cfe Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 12 Aug 2024 09:56:13 +0000
Subject: [PATCH 14/50] remove overwrite of _autoset_attn_implementation
 following @zucchini-nlp suggestion

---
 .../models/idefics3/modeling_idefics3.py      | 59 ++++++++++---------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index aa918692dbda11..f7474a42e014d8 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -534,12 +534,39 @@ def forward(
         )
 
 
-class Idefics3VisionTransformer(nn.Module):
+IDEFICS3_VISION_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Idefics3VisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+@add_start_docstrings(
+    "The Idefics3 Vision Transformer Model outputting raw image embedding.",
+    IDEFICS3_VISION_START_DOCSTRING,
+)
+class Idefics3VisionTransformer(PreTrainedModel):
+    config_class = Idefics3VisionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Idefics3VisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+
     def __init__(self, config: Idefics3VisionConfig):
-        super().__init__()
+        super().__init__(config)
         embed_dim = config.hidden_size
 
-        self.config = config
         self.embeddings = Idefics3VisionEmbeddings(config)
         self.encoder = Idefics3Encoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@@ -715,30 +742,6 @@ def _init_weights(self, module):
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
 
-    @classmethod
-    def _autoset_attn_implementation(
-        cls,
-        config,
-        use_flash_attention_2: bool = False,
-        torch_dtype: Optional[torch.dtype] = None,
-        device_map: Optional[Union[str, Dict[str, int]]] = None,
-        check_device_map: bool = True,
-        **kwargs,
-    ):
-        """
-        Overrides the method in `PreTrainedModel` to update the vision config with the correct attention implementation
-        """
-        config = super()._autoset_attn_implementation(
-            config=config,
-            use_flash_attention_2=use_flash_attention_2,
-            torch_dtype=torch_dtype,
-            device_map=device_map,
-            check_device_map=check_device_map,
-            **kwargs,
-        )
-        config.vision_config._attn_implementation = config._attn_implementation
-        return config
-
 
 IDEFICS3_INPUTS_DOCSTRING = r"""
     Args:
@@ -820,7 +823,7 @@ def __init__(self, config: Idefics3Config):
         self.padding_idx = self.config.text_config.pad_token_id
         self.vocab_size = self.config.text_config.vocab_size
 
-        self.vision_model = Idefics3VisionTransformer(config.vision_config)
+        self.vision_model = Idefics3VisionTransformer._from_config(config.vision_config, attn_implementation=config._attn_implementation)
         self.connector = Idefics3Connector(config)
         self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation)
 

From a1635647b9cee9972eb0d7eb920c043cb4a49bd4 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 12 Aug 2024 11:45:24 +0000
Subject: [PATCH 15/50] improve example

---
 .../models/idefics3/modeling_idefics3.py      | 42 +++++++++++++------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index f7474a42e014d8..9ebf962b2a746a 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -1119,25 +1119,43 @@ def forward(
         >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
 
         >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
-        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", device_map="auto")
-
-        >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
-        >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
+        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", torch_dtype=torch.bfloat16, device_map="auto")
 
         >>> # Create inputs
-        >>> prompts = [
-        ...   "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
-        ...   "In which city is that bridge located?<image>",
-        ... ]
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {
+        ...                 "type": "text",
+        ...                 "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty.",
+        ...             },
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "What can we see in this image?"},
+        ...         ],
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In which city is that bridge located?"},
+        ...         ],
+        ...     },
+        ... ]
+        >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
         >>> images = [[image1, image2], [image3]]
-        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)
 
         >>> # Generate
-        >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
         >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
 
-        >>> print(generated_texts)
-        ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
+        >>> print(generated_texts[0])
+        Assistant: There are buildings, trees, lights, and water visible in this image.
+
+        >>> print(generated_texts[1])
+        Assistant: The bridge is in San Francisco.
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (

From 6f0a479e80350e75bda323a73b53deab201ca98f Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 12 Aug 2024 16:38:32 +0000
Subject: [PATCH 16/50] few improvements from amy's review

---
 .../models/idefics3/modeling_idefics3.py      | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 9ebf962b2a746a..19ca944187e622 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -265,6 +265,7 @@ def forward(
         return attn_output, attn_weights
 
 
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionFlashAttention2 with Idefics2->Idefics3
 class Idefics3VisionFlashAttention2(Idefics3VisionAttention):
     """
     Idefics3Vision flash attention module. This module inherits from `Idefics3VisionAttention` as the weights of the module stays
@@ -386,10 +387,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 
 class Idefics3SimpleMLP(nn.Module):
-    def __init__(self, input_size, output_size):
+    def __init__(self, config):
         super().__init__()
-        self.input_size = input_size
-        self.output_size = output_size
+        self.config = config
+
+        input_size=config.vision_config.hidden_size * (config.scale_factor**2)
+        output_size=config.text_config.hidden_size
         self.proj = nn.Linear(input_size, output_size, bias=False)
 
     def forward(self, x):
@@ -550,6 +553,7 @@ def forward(
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer with Idefics2->Idefics3
 @add_start_docstrings(
     "The Idefics3 Vision Transformer Model outputting raw image embedding.",
     IDEFICS3_VISION_START_DOCSTRING,
@@ -671,20 +675,17 @@ class Idefics3Connector(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.scale_factor = config.scale_factor
-        self.modality_projection = Idefics3SimpleMLP(
-            input_size=config.vision_config.hidden_size * (self.scale_factor**2),
-            output_size=config.text_config.hidden_size,
-        )
+        self.modality_projection = Idefics3SimpleMLP(config)
 
     def pixel_shuffle(self, x, scale_factor=2):
         bsz, seq, embed_dim = x.size()
         height = width = int(seq**0.5)
         x = x.view(bsz, height, width, embed_dim)
         x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor)
-        x = x.permute(0, 2, 1, 3).contiguous()
-        x = x.view(bsz, int(width / scale_factor), int(height / scale_factor), embed_dim * (scale_factor**2))
-        x = x.permute(0, 2, 1, 3).contiguous()
-        x = x.view(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
+        x = x.permute(0, 2, 1, 3)
+        x = x.reshape(bsz, int(width / scale_factor), int(height / scale_factor), embed_dim * (scale_factor**2))
+        x = x.permute(0, 2, 1, 3)
+        x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
         return x
 
     def forward(self, image_hidden_states):

From 8361fce7e3fbdb8c98dc37ad5ecab2dc1297c197 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 12 Aug 2024 16:39:08 +0000
Subject: [PATCH 17/50] big change to enable processing input images as numpy
 arrays

---
 .../idefics3/image_processing_idefics3.py     | 245 +++++++++++-------
 1 file changed, 147 insertions(+), 98 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 7806d22947359a..2d54c7c47d00ba 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -29,6 +29,9 @@
     get_image_size,
     infer_channel_dimension_format,
     is_pil_image,
+    is_torch_tensor,
+    is_jax_tensor,
+    is_tf_tensor,
     is_scaled_image,
     is_valid_image,
     to_numpy_array,
@@ -161,79 +164,6 @@ def get_resize_output_image_size(
     return height, width
 
 
-def split_image(
-    image: np.ndarray,
-    max_image_size: Dict[str, int],
-    resample: PILImageResampling = PILImageResampling.LANCZOS,
-    input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    data_format: Optional[Union[str, ChannelDimension]] = None,
-):
-    """
-    Image splitting strategy.
-    1) If one side of the original image is larger than `max_image_size`, resize it to `max_image_size` while preserving the aspect ratio.
-    2) Divide the resulting image into `ceil(height / max_image_size)` x `ceil(width / max_image_size)`
-    sub-images of approximately the same size each (up to the fact that `vision_encoder_max_image_size` does not divide `height` or
-    `width`).
-    3) Returns the list of the crops and the original image, in addition to the number of splits for the height and the width.
-    """
-    if isinstance(image, Image.Image):
-        width, height = image.size
-    else:
-        height, width = get_image_size(image, channel_dim=input_data_format)
-    max_height = max_width = max_image_size["longest_edge"]
-
-    frames = []
-    if height > max_height or width > max_width:
-        # Calculate the number of splits
-        num_splits_h = math.ceil(height / max_height)
-        num_splits_w = math.ceil(width / max_width)
-        # Calculate the optimal width and height for the sub-images
-        optimal_height = math.ceil(height / num_splits_h)
-        optimal_width = math.ceil(width / num_splits_w)
-
-        # Iterate through each row and column
-        for r in range(num_splits_h):
-            for c in range(num_splits_w):
-                # Calculate the starting point of the crop
-                start_x = c * optimal_width
-                start_y = r * optimal_height
-
-                # Calculate the ending point of the crop
-                end_x = min(start_x + optimal_width, width)
-                end_y = min(start_y + optimal_height, height)
-
-                # Crop the image
-                if isinstance(image, Image.Image):
-                    cropped_image = image.crop((start_x, start_y, end_x, end_y))
-                else:
-                    cropped_image = _crop(
-                        image, start_x, start_y, end_x, end_y, input_data_format=input_data_format, data_format=data_format
-                    )
-                frames.append(cropped_image)
-
-        # For the global image at the end, we resize it to match the max_image_size, for cpu memory efficiency
-        global_image_height, global_image_width = max_height, max_width
-        if height != global_image_height or width != global_image_width:
-            if isinstance(image, Image.Image):
-                image = image.resize((global_image_width, global_image_height), resample=resample)
-            else:
-                image = resize(
-                    image,
-                    (global_image_height, global_image_width),
-                    resample=resample,
-                    input_data_format=input_data_format,
-                )
-    else:
-        num_splits_h, num_splits_w = 0, 0
-
-    if data_format is not None and not isinstance(image, Image.Image):
-        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
-
-    frames.append(image)
-
-    return frames, num_splits_h, num_splits_w
-
-
 # Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
 def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
     """
@@ -312,28 +242,67 @@ def make_pixel_mask(
     return mask
 
 
-# Copied from transformers.models.idefics2.image_processing_idefics2.convert_to_rgb
-def convert_to_rgb(image: ImageInput) -> ImageInput:
+# Copied from transformers.image_transforms.to_pil_image
+def to_pil_image(
+    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
+    do_rescale: Optional[bool] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    image_mode: str = "RGB",
+) -> "PIL.Image.Image":
+    """
+    Converts `image` to a PIL Image. Framework tensors are converted to NumPy first, a trailing singleton channel
+    axis is squeezed, and the data is cast to `uint8` before the image is built with `image_mode`.
+
+    Args:
+        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
+            The image to convert to the `PIL.Image` format.
+        do_rescale (`bool`, *optional*):
+            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255).
+            NOTE: accepted but currently not used by this implementation; values are cast to `uint8` without
+            rescaling.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. NOTE: accepted but currently not used here.
+
+    Returns:
+        `PIL.Image.Image`: The converted image.
+    """
+    if isinstance(image, PIL.Image.Image):
+        return image
+
+    # Convert all tensors to numpy arrays before converting to PIL image
+    if is_torch_tensor(image) or is_tf_tensor(image):
+        image = image.numpy()
+    elif is_jax_tensor(image):
+        image = np.array(image)
+    elif not isinstance(image, np.ndarray):
+        raise ValueError("Input image type not supported: {}".format(type(image)))
+
+    # If there is a single channel, we squeeze it, as otherwise PIL can't handle it.
+    image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image
+    image = image.astype(np.uint8)
+    return PIL.Image.fromarray(image, mode=image_mode)
+
+def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePalette]=None) -> ImageInput:
     """
     Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
     as is.
     Args:
         image (Image):
             The image to convert.
+        palette (List[int], *optional*):
+            Palette applied when a non-PIL `image` is converted to a mode "P" PIL image before compositing.
     """
     if not isinstance(image, PIL.Image.Image):
-        return image
-
-    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
-    # for transparent images. The call to `alpha_composite` handles this case
-    if image.mode == "RGB":
-        return image
+        mode = "P" if palette is not None else None
+        image = to_pil_image(image, image_mode=mode)
+        if image.mode=='P' and palette is not None:
+            image.putpalette(palette)
 
     image_rgba = image.convert("RGBA")
     background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
     alpha_composite = Image.alpha_composite(background, image_rgba)
     alpha_composite = alpha_composite.convert("RGB")
-    return alpha_composite
+    return np.array(alpha_composite)
 
 
 # FIXME Amy: make a more general crop function that isn't just centre crop
@@ -475,9 +444,18 @@ def resize(
             raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
         if isinstance(image, Image.Image):
             return image.resize((size[1], size[0]), resample=resample)
-        return resize(
-            image, size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
-        )
+        else:
+            image_mode = None
+            if image.ndim == 2 or image.shape[-1] == 1:
+                image_mode = 'P'
+            image = to_pil_image(image, input_data_format=input_data_format, image_mode=image_mode)
+
+        resized_image = image.resize((size[1], size[0]), resample=resample)
+        resized_array = np.array(resized_image)
+        if resized_array.ndim == 2:
+            resized_array = np.expand_dims(resized_array, axis=-1)
+        return resized_array
+
 
     def split_image(
         self,
@@ -485,11 +463,16 @@ def split_image(
         max_image_size: Dict[str, int],
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         data_format: Optional[Union[str, ChannelDimension]] = None,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
     ):
         """
         Split an image into squares of side max_image_size and the original image resized to max_image_size.
         That means that a single image becomes a sequence of images.
         This is a "trick" to spend more compute on each image with no changes in the vision encoder.
+        1) If one side of the original image is larger than `max_image_size`, resize it to `max_image_size` while preserving the aspect ratio.
+        2) Divide the resulting image into `ceil(height / max_image_size)` x `ceil(width / max_image_size)`
+        sub-images of the same size each (image_size, image_size). Typically, 364x364.
+        3) Returns the list of the crops and the original image, in addition to the number of splits for the height and the width.
 
         Args:
             image (`np.ndarray`):
@@ -499,9 +482,67 @@ def split_image(
                 patches of this size, and the original image will be concatenated with the patches, resized to max_size.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be the same as the input image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+                Resampling filter to use when resizing the image.
         """
-        return split_image(image, max_image_size, input_data_format=input_data_format, data_format=data_format)
+        if isinstance(image, Image.Image):
+            width, height = image.size
+        else:
+            height, width = get_image_size(image, channel_dim=input_data_format)
+        max_height = max_width = max_image_size["longest_edge"]
+
+        frames = []
+        if height > max_height or width > max_width:
+            # Calculate the number of splits
+            num_splits_h = math.ceil(height / max_height)
+            num_splits_w = math.ceil(width / max_width)
+            # Calculate the optimal width and height for the sub-images
+            optimal_height = math.ceil(height / num_splits_h)
+            optimal_width = math.ceil(width / num_splits_w)
+
+            # Iterate through each row and column
+            for r in range(num_splits_h):
+                for c in range(num_splits_w):
+                    # Calculate the starting point of the crop
+                    start_x = c * optimal_width
+                    start_y = r * optimal_height
+
+                    # Calculate the ending point of the crop
+                    end_x = min(start_x + optimal_width, width)
+                    end_y = min(start_y + optimal_height, height)
+
+                    # Crop the image
+                    if isinstance(image, Image.Image):
+                        cropped_image = image.crop((start_x, start_y, end_x, end_y))
+                    else:
+                        cropped_image = _crop(
+                            image, start_x, start_y, end_x, end_y, input_data_format=input_data_format, data_format=data_format
+                        )
+                    frames.append(cropped_image)
+
+            # For the global image at the end, we resize it to match the max_image_size, for cpu memory efficiency
+            global_image_height, global_image_width = max_height, max_width
+            if height != global_image_height or width != global_image_width:
+                if isinstance(image, Image.Image):
+                    image = image.resize((global_image_width, global_image_height), resample=resample)
+                else:
+                    image = self.resize(
+                        image,
+                        {'height': global_image_height, 'width': global_image_width},
+                        resample=resample,
+                        input_data_format=input_data_format,
+                    )
+        else:
+            num_splits_h, num_splits_w = 0, 0
+
+        if data_format is not None and not isinstance(image, Image.Image):
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+        frames.append(image)
 
+        return frames, num_splits_h, num_splits_w
     def _pad_image(
         self,
         image: np.ndarray,
@@ -708,15 +749,25 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        for img in images_list[0]:
-            if not is_pil_image(img):
-                logger.warning_once(
-                    "Idefics3's image processing pipeline is optimized to process PIL images, but you passed a different type of image. "
-                    "This might lead to inconsistent results"
-                )
-                if input_data_format is None:
-                    # We assume that all images have the same channel dimension format.
-                    input_data_format = infer_channel_dimension_format(images_list[0][0])
+
+        palettes = [image[0].getpalette() if isinstance(image[0], Image.Image) and image[0].mode == 'P' else None for image in images_list] # save the palettes for conversion to RGB
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+        
+        new_images_list = []
+        for images in images_list:
+            new_images = []
+            for img in images:
+                if img.ndim == 2:
+                    img = np.expand_dims(img, axis=-1)
+                new_images.append(img)
+            new_images_list.append(new_images)
+        images_list = new_images_list
+        del new_images_list
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
 
 
         validate_preprocess_arguments(
@@ -794,10 +845,8 @@ def preprocess(
             images_list_cols = [[0] * len(images) for images in images_list]
 
         if do_convert_rgb:
-            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
+            images_list = [[convert_to_rgb(image, palette) for image in images] for images, palette in zip(images_list, palettes)]
 
-        # All transformations expect numpy arrays.
-        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
 
         if is_scaled_image(images_list[0][0]) and do_rescale:
             logger.warning_once(

From 32970d0dced4a17331fe27aa915661e4257ac7ee Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 13 Aug 2024 09:53:59 +0000
Subject: [PATCH 18/50] Changes to the code to uniformize processor kwargs

---
 .../idefics3/image_processing_idefics3.py     | 20 +++-
 .../models/idefics3/modeling_idefics3.py      |  5 +-
 .../models/idefics3/processing_idefics3.py    | 95 ++++++++++++++-----
 .../idefics3/test_processing_idefics3.py      |  5 +-
 4 files changed, 89 insertions(+), 36 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 2d54c7c47d00ba..106f8b2f28c580 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -18,8 +18,10 @@
 
 import numpy as np
 
+from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available
+
 from ...image_processing_utils import BaseImageProcessor, BatchFeature
-from ...image_transforms import PaddingMode, pad, rescale, resize, to_channel_dimension_format
+from ...image_transforms import PaddingMode, pad, rescale, to_channel_dimension_format
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
@@ -28,11 +30,10 @@
     PILImageResampling,
     get_image_size,
     infer_channel_dimension_format,
-    is_pil_image,
-    is_torch_tensor,
     is_jax_tensor,
-    is_tf_tensor,
     is_scaled_image,
+    is_tf_tensor,
+    is_torch_tensor,
     is_valid_image,
     to_numpy_array,
     valid_images,
@@ -41,6 +42,15 @@
 from ...utils import TensorType, is_vision_available, logging
 
 
+if is_torch_available():
+    import torch
+
+if is_tf_available():
+    import tensorflow as tf
+
+if is_flax_available():
+    import jax.numpy as jnp
+
 logger = logging.get_logger(__name__)
 
 
@@ -753,7 +763,7 @@ def preprocess(
         palettes = [image[0].getpalette() if isinstance(image[0], Image.Image) and image[0].mode == 'P' else None for image in images_list] # save the palettes for conversion to RGB
         # All transformations expect numpy arrays.
         images_list = [[to_numpy_array(image) for image in images] for images in images_list]
-        
+
         new_images_list = []
         for images in images_list:
             new_images = []
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 19ca944187e622..bb8e8ba282cb60 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -14,9 +14,8 @@
 # limitations under the License.
 """PyTorch Idefics3 model."""
 
-import math
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.utils.checkpoint
@@ -25,7 +24,7 @@
 
 from ... import PreTrainedModel
 from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
+from ...cache_utils import Cache
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
 from ...modeling_outputs import BaseModelOutput, ModelOutput
 from ...utils import (
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 8c01eb1fae857e..5d50f41a959585 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -16,18 +16,24 @@
 Processor class for Idefics3.
 """
 
+import sys
+import warnings
 from typing import TYPE_CHECKING, List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, is_valid_image, load_image
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
-from ...utils import TensorType, logging
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
+from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput
+from ...utils import logging
 
 
 if TYPE_CHECKING:
     from ...tokenization_utils_base import PreTokenizedInput
 
+if sys.version_info >= (3, 11):
+    from typing import Unpack
+else:
+    from typing_extensions import Unpack
 
 logger = logging.get_logger(__name__)
 
@@ -73,6 +79,25 @@ def get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_ar
         )
     return _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token)
 
+class Idefics3ImagesKwargs(ImagesKwargs, total=False):
+    image_seq_len: Optional[int]
+    return_row_col_info: Optional[bool]
+
+
+class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Idefics3ImagesKwargs
+
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "is_split_into_words": False,
+        },
+        "images_kwargs": {
+            "image_seq_len": 169,
+            "return_row_col_info": True,
+        },
+    }
 
 class Idefics3Processor(ProcessorMixin):
     r"""
@@ -125,15 +150,11 @@ def _extract_images_from_prompts(self, prompts):
 
     def __call__(
         self,
-        text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None,
         images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None,
-        image_seq_len: int = 169,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        is_split_into_words: bool = False,
-        add_special_tokens: bool = True,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[Idefics3ProcessorKwargs],
     ) -> BatchEncoding:
         """
         Processes the input prompts and returns a BatchEncoding.
@@ -158,7 +179,7 @@ def __call__(
         ...     "<image>In this image, we see",
         ...     "bla bla bla<image>",
         ... ]
-        >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True)
+        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
         >>> input_ids = outputs.input_ids
         >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
         >>> print(input_tokens)
@@ -166,16 +187,16 @@ def __call__(
         ```
 
         Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
             text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
 
                 Wherever an image token, `<image>` is encountered it is expanded to
-                `<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
+                `<fake_token_around_image>` + `<row_x_col_y>` + `<image>` * `image_seq_len` + `<fake_token_around_image>`.
             image_seq_len (`int`, *optional*):
                 The length of the image sequence. If not provided, the default value of 169 is used.
             padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
@@ -194,6 +215,36 @@ def __call__(
                 If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
                 information.
         """
+        if text is None and images is None:
+            raise ValueError("You must provide either `text` or `images`.")
+        # check if images and text inputs are reversed for BC
+        if (
+            text is not None
+            and not isinstance(text[0], str)
+            or images is not None
+            and not (
+                is_image_or_image_url(images)
+                or is_image_or_image_url(images[0])
+                or (isinstance(images[0], list) and is_image_or_image_url(images[0][0]))
+            )
+        ):
+            warnings.warn(
+                "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second. "
+                "Images and text inputs will be swapped."
+            )
+            images, text = text, images
+
+        output_kwargs = self._merge_kwargs(
+            Idefics3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Temporary fix for "padding_side" in init_kwargs
+        _ = output_kwargs['text_kwargs'].pop("padding_side", None)
+
+        image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
+
         n_images_in_text = []
         n_images_in_images = []
         inputs = BatchFeature()
@@ -226,7 +277,7 @@ def __call__(
             images = new_images
             del new_images
 
-            image_inputs = self.image_processor(images, return_tensors=return_tensors, return_row_col_info=True)
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
             inputs.update(image_inputs)
 
         if text is not None:
@@ -265,15 +316,7 @@ def __call__(
                     sample += image_prompt_string + split_sample[i + 1]
                 prompt_strings.append(sample)
 
-            text_inputs = self.tokenizer(
-                text=prompt_strings,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                is_split_into_words=is_split_into_words,
-                return_tensors=return_tensors,
-            )
+            text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
             inputs.update(text_inputs)
 
             if n_images_in_images != n_images_in_text:
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 7391f076a75486..64bbb13cdd0e4c 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
 from io import BytesIO
 
 import requests
@@ -22,6 +21,8 @@
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available
 
+from ...test_processing_common import ProcessorTesterMixin
+
 
 if is_vision_available():
     from PIL import Image
@@ -29,7 +30,7 @@
 
 @require_torch
 @require_vision
-class Idefics3ProcessorTest(unittest.TestCase):
+class Idefics3ProcessorTest(ProcessorTesterMixin):
     def setUp(self):
         self.processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
         self.image1 = Image.open(

From c504f001d5d5db6e4eb2284b940ddbe85b6f2df9 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 13 Aug 2024 13:02:36 +0000
Subject: [PATCH 19/50] image processing tests

---
 .../models/idefics3/configuration_idefics3.py |  1 -
 .../idefics3/image_processing_idefics3.py     | 46 ++++++++------
 .../models/idefics3/modeling_idefics3.py      | 19 ++++--
 .../models/idefics3/processing_idefics3.py    | 14 +++--
 .../test_image_processing_idefics3.py         | 63 ++++++-------------
 5 files changed, 67 insertions(+), 76 deletions(-)

diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 4dcaded43e5839..4b03a3af1576fb 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -126,7 +126,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
         return cls.from_dict(config_dict, **kwargs)
 
 
-
 class Idefics3Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`Idefics3Model`]. It is used to instantiate a
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 106f8b2f28c580..6a8bf93140f491 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -164,13 +164,9 @@ def get_resize_output_image_size(
         height, width = get_image_size(image, channel_dim=input_data_format)
 
     # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
-    height, width = _resize_output_size_rescale_to_max_len(
-        height, width, max_len=resolution_max_side
-    )
+    height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=resolution_max_side)
     # Find the output size when scaling the image to be below the max_image_size
-    height, width = _resize_output_size_scale_below_upper_bound(
-        height, width, max_len=max_image_size
-    )
+    height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=max_image_size)
     return height, width
 
 
@@ -292,7 +288,8 @@ def to_pil_image(
     image = image.astype(np.uint8)
     return PIL.Image.fromarray(image, mode=image_mode)
 
-def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePalette]=None) -> ImageInput:
+
+def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePalette] = None) -> ImageInput:
     """
     Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
     as is.
@@ -305,7 +302,7 @@ def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePa
     if not isinstance(image, PIL.Image.Image):
         mode = "P" if palette is not None else None
         image = to_pil_image(image, image_mode=mode)
-        if image.mode=='P' and palette is not None:
+        if image.mode == "P" and palette is not None:
             image.putpalette(palette)
 
     image_rgba = image.convert("RGBA")
@@ -407,7 +404,7 @@ def __init__(
         super().__init__(**kwargs)
         self.do_convert_rgb = do_convert_rgb
         self.do_resize = do_resize
-        self.size = size if size is not None else {"longest_edge": 4*364}
+        self.size = size if size is not None else {"longest_edge": 4 * 364}
         self.resample = resample
         self.do_image_splitting = do_image_splitting
         self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 364}
@@ -457,7 +454,7 @@ def resize(
         else:
             image_mode = None
             if image.ndim == 2 or image.shape[-1] == 1:
-                image_mode = 'P'
+                image_mode = "P"
             image = to_pil_image(image, input_data_format=input_data_format, image_mode=image_mode)
 
         resized_image = image.resize((size[1], size[0]), resample=resample)
@@ -466,7 +463,6 @@ def resize(
             resized_array = np.expand_dims(resized_array, axis=-1)
         return resized_array
 
-
     def split_image(
         self,
         image,
@@ -528,7 +524,13 @@ def split_image(
                         cropped_image = image.crop((start_x, start_y, end_x, end_y))
                     else:
                         cropped_image = _crop(
-                            image, start_x, start_y, end_x, end_y, input_data_format=input_data_format, data_format=data_format
+                            image,
+                            start_x,
+                            start_y,
+                            end_x,
+                            end_y,
+                            input_data_format=input_data_format,
+                            data_format=data_format,
                         )
                     frames.append(cropped_image)
 
@@ -540,7 +542,7 @@ def split_image(
                 else:
                     image = self.resize(
                         image,
-                        {'height': global_image_height, 'width': global_image_width},
+                        {"height": global_image_height, "width": global_image_width},
                         resample=resample,
                         input_data_format=input_data_format,
                     )
@@ -553,6 +555,7 @@ def split_image(
         frames.append(image)
 
         return frames, num_splits_h, num_splits_w
+
     def _pad_image(
         self,
         image: np.ndarray,
@@ -750,7 +753,6 @@ def preprocess(
                 "Idefics3 was trained on splitted image to support high resolution. Setting do_image_splitting=False will degrade the performance."
             )
 
-
         images_list = make_list_of_images(images)
 
         if not valid_images(images_list[0]):
@@ -759,8 +761,10 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-
-        palettes = [image[0].getpalette() if isinstance(image[0], Image.Image) and image[0].mode == 'P' else None for image in images_list] # save the palettes for conversion to RGB
+        palettes = [
+            image[0].getpalette() if isinstance(image[0], Image.Image) and image[0].mode == "P" else None
+            for image in images_list
+        ]  # save the palettes for conversion to RGB
         # All transformations expect numpy arrays.
         images_list = [[to_numpy_array(image) for image in images] for images in images_list]
 
@@ -779,7 +783,6 @@ def preprocess(
             # We assume that all images have the same channel dimension format.
             input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
 
-
         validate_preprocess_arguments(
             do_rescale=do_rescale,
             rescale_factor=rescale_factor,
@@ -824,7 +827,9 @@ def preprocess(
                     width = int(height * aspect_ratio)
                     width = math.ceil(width / self.vision_encoder_max_size) * self.vision_encoder_max_size
                 new_size = {"height": height, "width": width}
-                new_images.append(self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format))
+                new_images.append(
+                    self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format)
+                )
             new_images_list.append(new_images)
         images_list = new_images_list
         del new_images_list
@@ -855,8 +860,9 @@ def preprocess(
             images_list_cols = [[0] * len(images) for images in images_list]
 
         if do_convert_rgb:
-            images_list = [[convert_to_rgb(image, palette) for image in images] for images, palette in zip(images_list, palettes)]
-
+            images_list = [
+                [convert_to_rgb(image, palette) for image in images] for images, palette in zip(images_list, palettes)
+            ]
 
         if is_scaled_image(images_list[0][0]) and do_rescale:
             logger.warning_once(
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index bb8e8ba282cb60..4c77f699c441e5 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -390,8 +390,8 @@ def __init__(self, config):
         super().__init__()
         self.config = config
 
-        input_size=config.vision_config.hidden_size * (config.scale_factor**2)
-        output_size=config.text_config.hidden_size
+        input_size = config.vision_config.hidden_size * (config.scale_factor**2)
+        output_size = config.text_config.hidden_size
         self.proj = nn.Linear(input_size, output_size, bias=False)
 
     def forward(self, x):
@@ -552,7 +552,8 @@ def forward(
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
-#Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer
+
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer
 @add_start_docstrings(
     "The Idefics3 Vision Transformer Model outputting raw image embedding.",
     IDEFICS3_VISION_START_DOCSTRING,
@@ -823,11 +824,15 @@ def __init__(self, config: Idefics3Config):
         self.padding_idx = self.config.text_config.pad_token_id
         self.vocab_size = self.config.text_config.vocab_size
 
-        self.vision_model = Idefics3VisionTransformer._from_config(config.vision_config, attn_implementation=config._attn_implementation)
+        self.vision_model = Idefics3VisionTransformer._from_config(
+            config.vision_config, attn_implementation=config._attn_implementation
+        )
         self.connector = Idefics3Connector(config)
         self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation)
 
-        self.image_seq_len = int(((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2))
+        self.image_seq_len = int(
+            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
+        )
         self.image_token_id = self.config.image_token_id
 
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
@@ -884,7 +889,9 @@ def inputs_merger(
         num_images, _, vision_hidden_size = image_hidden_states.shape
         special_image_token_mask = input_ids == self.image_token_id
         new_inputs_embeds = inputs_embeds.clone()
-        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size).to(inputs_embeds.dtype) # cast to the dtype of the input_embeds to support quantized models
+        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size).to(
+            inputs_embeds.dtype
+        )  # cast to the dtype of the input_embeds to support quantized models
         new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
         return new_inputs_embeds
 
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 5d50f41a959585..8f40d10b3b4fe1 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -52,9 +52,7 @@ def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around
     for n_h in range(image_rows):
         for n_w in range(image_cols):
             text_split_images += (
-                f"{fake_token_around_image}"
-                + f"<row_{n_h + 1}_col_{n_w + 1}>"
-                + f"{image_token}" * image_seq_len
+                f"{fake_token_around_image}" + f"<row_{n_h + 1}_col_{n_w + 1}>" + f"{image_token}" * image_seq_len
             )
         text_split_images += "\n"
 
@@ -69,7 +67,9 @@ def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around
 
 def _prompt_single_image(image_seq_len, fake_token_around_image, image_token):
     """Prompt with expanded image tokens for a single image."""
-    return f"{fake_token_around_image}" + "<global-img>" + f"{image_token}" * image_seq_len + f"{fake_token_around_image}"
+    return (
+        f"{fake_token_around_image}" + "<global-img>" + f"{image_token}" * image_seq_len + f"{fake_token_around_image}"
+    )
 
 
 def get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token):
@@ -79,6 +79,7 @@ def get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_ar
         )
     return _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token)
 
+
 class Idefics3ImagesKwargs(ImagesKwargs, total=False):
     image_seq_len: Optional[int]
     return_row_col_info: Optional[bool]
@@ -99,6 +100,7 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
         },
     }
 
+
 class Idefics3Processor(ProcessorMixin):
     r"""
     Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor.
@@ -241,7 +243,7 @@ def __call__(
         )
 
         # Temporary fix for "paddding_side" in init_kwargs
-        _ = output_kwargs['text_kwargs'].pop("padding_side", None)
+        _ = output_kwargs["text_kwargs"].pop("padding_side", None)
 
         image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
 
@@ -270,7 +272,7 @@ def __call__(
                 new_images.append([])
                 for im in sample:
                     if is_valid_image(im):
-                        new_images[-1].append(im) # already loaded
+                        new_images[-1].append(im)  # already loaded
                     elif isinstance(im, str):
                         new_images[-1].append(load_image(im))
 
diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py
index ff4fa435ed51b2..d4f12d5dd574ad 100644
--- a/tests/models/idefics3/test_image_processing_idefics3.py
+++ b/tests/models/idefics3/test_image_processing_idefics3.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 
+from transformers.image_utils import PILImageResampling
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
@@ -27,14 +28,14 @@
 if is_vision_available():
     from PIL import Image
 
-    from transformers import Idefics2ImageProcessor
+    from transformers import Idefics3ImageProcessor
 
 
 if is_torch_available():
     import torch
 
 
-class Idefics2ImageProcessingTester(unittest.TestCase):
+class Idefics3ImageProcessingTester(unittest.TestCase):
     def __init__(
         self,
         parent,
@@ -43,9 +44,10 @@ def __init__(
         num_images=1,
         image_size=18,
         min_resolution=30,
-        max_resolution=400,
+        max_resolution=80,
         do_resize=True,
         size=None,
+        max_image_size=None,
         do_rescale=True,
         rescale_factor=1 / 255,
         do_normalize=True,
@@ -54,8 +56,9 @@ def __init__(
         do_convert_rgb=True,
         do_pad=True,
         do_image_splitting=True,
+        resample=PILImageResampling.LANCZOS,
     ):
-        size = size if size is not None else {"longest_edge": 4*364}
+        size = size if size is not None else {"longest_edge": 182}
         self.parent = parent
         self.batch_size = batch_size
         self.num_channels = num_channels
@@ -65,14 +68,16 @@ def __init__(
         self.max_resolution = max_resolution
         self.do_resize = do_resize
         self.size = size
+        self.resample = resample
+        self.do_image_splitting = do_image_splitting
+        self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 182}
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
         self.do_normalize = do_normalize
         self.image_mean = image_mean
         self.image_std = image_std
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
         self.do_convert_rgb = do_convert_rgb
         self.do_pad = do_pad
-        self.do_image_splitting = do_image_splitting
 
     def prepare_image_processor_dict(self):
         return {
@@ -90,41 +95,10 @@ def prepare_image_processor_dict(self):
 
     def get_expected_values(self, image_inputs, batched=False):
         """
-        This function computes the expected height and width when providing images to BridgeTowerImageProcessor,
-        assuming do_resize is set to True with a scalar size and size_divisor.
+        This function computes the expected height and width when providing images to Idefics3ImageProcessor,
+        assuming do_resize is set to True. The expected size in that case is the max image size.
         """
-        if not batched:
-            shortest_edge = self.size["shortest_edge"]
-            longest_edge = self.size["longest_edge"]
-            image = image_inputs[0]
-            if isinstance(image, Image.Image):
-                w, h = image.size
-            elif isinstance(image, np.ndarray):
-                h, w = image.shape[0], image.shape[1]
-            else:
-                h, w = image.shape[1], image.shape[2]
-
-            aspect_ratio = w / h
-            if w > h and w >= longest_edge:
-                w = longest_edge
-                h = int(w / aspect_ratio)
-            elif h > w and h >= longest_edge:
-                h = longest_edge
-                w = int(h * aspect_ratio)
-            w = max(w, shortest_edge)
-            h = max(h, shortest_edge)
-            expected_height = h
-            expected_width = w
-        else:
-            expected_values = []
-            for images in image_inputs:
-                for image in images:
-                    expected_height, expected_width = self.get_expected_values([image])
-                    expected_values.append((expected_height, expected_width))
-            expected_height = max(expected_values, key=lambda item: item[0])[0]
-            expected_width = max(expected_values, key=lambda item: item[1])[1]
-
-        return expected_height, expected_width
+        return self.max_image_size["longest_edge"], self.max_image_size["longest_edge"]
 
     def expected_output_image_shape(self, images):
         height, width = self.get_expected_values(images, batched=True)
@@ -188,11 +162,11 @@ def prepare_image_inputs(
 @require_torch
 @require_vision
 class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
+    image_processing_class = Idefics3ImageProcessor if is_vision_available() else None
 
     def setUp(self):
         super().setUp()
-        self.image_processor_tester = Idefics2ImageProcessingTester(self)
+        self.image_processor_tester = Idefics3ImageProcessingTester(self)
 
     @property
     def image_processor_dict(self):
@@ -203,6 +177,9 @@ def test_image_processor_properties(self):
         self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
         self.assertTrue(hasattr(image_processing, "do_resize"))
         self.assertTrue(hasattr(image_processing, "size"))
+        self.assertTrue(hasattr(image_processing, "resample"))
+        self.assertTrue(hasattr(image_processing, "do_image_splitting"))
+        self.assertTrue(hasattr(image_processing, "max_image_size"))
         self.assertTrue(hasattr(image_processing, "do_rescale"))
         self.assertTrue(hasattr(image_processing, "rescale_factor"))
         self.assertTrue(hasattr(image_processing, "do_normalize"))

From a914e41cd8cd4597c284018b22a7189c2b07dd30 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 13 Aug 2024 15:02:06 +0000
Subject: [PATCH 20/50] image processing tests fixes and some bugs they
 discovered

---
 .../idefics3/image_processing_idefics3.py     | 38 +++++++++++--------
 .../test_image_processing_idefics3.py         | 15 ++++----
 .../idefics3/test_processing_idefics3.py      |  2 +-
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 6a8bf93140f491..eefd19b52a52d3 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -253,7 +253,7 @@ def to_pil_image(
     image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
     do_rescale: Optional[bool] = None,
     input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    image_mode: str = "RGB",
+    image_mode: Optional[str] = None,
 ) -> "PIL.Image.Image":
     """
     Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
@@ -274,7 +274,6 @@ def to_pil_image(
     """
     if isinstance(image, PIL.Image.Image):
         return image
-
     # Convert all tensors to numpy arrays before converting to PIL image
     if is_torch_tensor(image) or is_tf_tensor(image):
         image = image.numpy()
@@ -283,13 +282,16 @@ def to_pil_image(
     elif not isinstance(image, np.ndarray):
         raise ValueError("Input image type not supported: {}".format(type(image)))
 
+    # If the channel has been moved to first dim, we put it back at the end.
+    image = to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format)
+
     # If there is a single channel, we squeeze it, as otherwise PIL can't handle it.
     image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image
     image = image.astype(np.uint8)
     return PIL.Image.fromarray(image, mode=image_mode)
 
 
-def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePalette] = None) -> ImageInput:
+def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePalette] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,) -> ImageInput:
     """
     Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
     as is.
@@ -298,10 +300,12 @@ def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePa
             The image to convert.
         palette (List[int], *optional*):
             The palette to use if given.
+        input_data_format (ChannelDimension or str, *optional*):
+            The channel dimension format of the input image.
     """
     if not isinstance(image, PIL.Image.Image):
         mode = "P" if palette is not None else None
-        image = to_pil_image(image, image_mode=mode)
+        image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format)
         if image.mode == "P" and palette is not None:
             image.putpalette(palette)
 
@@ -398,7 +402,6 @@ def __init__(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: bool = True,
-        vision_encoder_max_size: int = 364,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -414,7 +417,6 @@ def __init__(
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
         self.do_pad = do_pad
-        self.vision_encoder_max_size = vision_encoder_max_size
 
     def resize(
         self,
@@ -806,10 +808,20 @@ def preprocess(
                 for images in images_list
             ]
 
+        # Resize might already change the channel dimension, so we will recompute it
+        input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
+
         # We will resize both height and width of each image to the nearest 364 multiple, disregarding the aspect ratio
         # for size=(10, 364) -> rescaled_size=(364, 364)
         # for size=(11, 365) -> rescaled_size=(364, 364*2)
         new_images_list = []
+        if 'longest_edge' in max_image_size:
+            vision_encoder_max_size = max_image_size["longest_edge"]
+        elif isinstance(max_image_size, int):
+            vision_encoder_max_size = max_image_size
+        else:
+            raise ValueError("Invalid max_image_size, must be a dictionary with key 'longest_edge' or an integer.")
+
         for images in images_list:
             new_images = []
             for img in images:
@@ -819,13 +831,13 @@ def preprocess(
                     height, width, _ = img.shape
                 aspect_ratio = width / height
                 if width >= height:
-                    width = math.ceil(width / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                    width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
                     height = int(width / aspect_ratio)
-                    height = math.ceil(height / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                    height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
                 elif height > width:
-                    height = math.ceil(height / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                    height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
                     width = int(height * aspect_ratio)
-                    width = math.ceil(width / self.vision_encoder_max_size) * self.vision_encoder_max_size
+                    width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
                 new_size = {"height": height, "width": width}
                 new_images.append(
                     self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format)
@@ -861,7 +873,7 @@ def preprocess(
 
         if do_convert_rgb:
             images_list = [
-                [convert_to_rgb(image, palette) for image in images] for images, palette in zip(images_list, palettes)
+                [convert_to_rgb(image, palette, input_data_format=input_data_format) for image in images] for images, palette in zip(images_list, palettes)
             ]
 
         if is_scaled_image(images_list[0][0]) and do_rescale:
@@ -875,10 +887,6 @@ def preprocess(
                 rescaled_images_array.append([rescale(img, rescale_factor) for img in image])
             images_list = rescaled_images_array
 
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images_list[0][0])
-
         if do_normalize:
             images_list = [
                 [
diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py
index d4f12d5dd574ad..20e3d85e743695 100644
--- a/tests/models/idefics3/test_image_processing_idefics3.py
+++ b/tests/models/idefics3/test_image_processing_idefics3.py
@@ -44,7 +44,7 @@ def __init__(
         num_images=1,
         image_size=18,
         min_resolution=30,
-        max_resolution=80,
+        max_resolution=40,
         do_resize=True,
         size=None,
         max_image_size=None,
@@ -58,7 +58,8 @@ def __init__(
         do_image_splitting=True,
         resample=PILImageResampling.LANCZOS,
     ):
-        size = size if size is not None else {"longest_edge": 182}
+        super().__init__()
+        self.size = size if size is not None else {"longest_edge": max_resolution}
         self.parent = parent
         self.batch_size = batch_size
         self.num_channels = num_channels
@@ -67,10 +68,9 @@ def __init__(
         self.min_resolution = min_resolution
         self.max_resolution = max_resolution
         self.do_resize = do_resize
-        self.size = size
         self.resample = resample
         self.do_image_splitting = do_image_splitting
-        self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 182}
+        self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 20}
         self.do_rescale = do_rescale
         self.rescale_factor = rescale_factor
         self.do_normalize = do_normalize
@@ -84,6 +84,7 @@ def prepare_image_processor_dict(self):
             "do_convert_rgb": self.do_convert_rgb,
             "do_resize": self.do_resize,
             "size": self.size,
+            "max_image_size": self.max_image_size,
             "do_rescale": self.do_rescale,
             "rescale_factor": self.rescale_factor,
             "do_normalize": self.do_normalize,
@@ -161,7 +162,7 @@ def prepare_image_inputs(
 
 @require_torch
 @require_vision
-class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = Idefics3ImageProcessor if is_vision_available() else None
 
     def setUp(self):
@@ -211,14 +212,12 @@ def test_call_numpy(self):
             )
 
     def test_call_numpy_4_channels(self):
+        # Idefics3 always processes images as RGB, so it always returns images with 3 channels
         for image_processing_class in self.image_processor_list:
             # Initialize image_processing
             image_processor_dict = self.image_processor_dict
-            image_processor_dict["image_mean"] = [0.5, 0.5, 0.5, 0.5]
-            image_processor_dict["image_std"] = [0.5, 0.5, 0.5, 0.5]
             image_processing = self.image_processing_class(**image_processor_dict)
             # create random numpy tensors
-            self.image_processor_tester.num_channels = 4
             image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
 
             for sample_images in image_inputs:
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 64bbb13cdd0e4c..aef9a214928b62 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -228,7 +228,7 @@ def test_apply_chat_template(self):
         rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
 
         expected_rendered = (
-            "User: What do these images show?<image><image><end_of_utterance>\n"
+            "<|begin_of_text|>User: What do these images show?<image><image><end_of_utterance>\n"
             "Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<end_of_utterance>\n"
             "User: And who is that?<end_of_utterance>\n"
             "Assistant:"

From 6722d13bda6b05cf843a2c93021a450b2688e91d Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 13 Aug 2024 15:06:07 +0000
Subject: [PATCH 21/50] addressed review comments from Yoni

---
 .../models/idefics3/image_processing_idefics3.py | 11 ++++++++---
 .../models/idefics3/processing_idefics3.py       | 16 ----------------
 .../models/idefics3/test_processing_idefics3.py  |  3 ++-
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index eefd19b52a52d3..a7e61ce8e2f06d 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -291,7 +291,11 @@ def to_pil_image(
     return PIL.Image.fromarray(image, mode=image_mode)
 
 
-def convert_to_rgb(image: ImageInput, palette: Optional[PIL.ImagePalette.ImagePalette] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,) -> ImageInput:
+def convert_to_rgb(
+    image: ImageInput,
+    palette: Optional[PIL.ImagePalette.ImagePalette] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> ImageInput:
     """
     Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
     as is.
@@ -815,7 +819,7 @@ def preprocess(
         # for size=(10, 364) -> rescaled_size=(364, 364)
         # for size=(11, 365) -> rescaled_size=(364, 364*2)
         new_images_list = []
-        if 'longest_edge' in max_image_size:
+        if "longest_edge" in max_image_size:
             vision_encoder_max_size = max_image_size["longest_edge"]
         elif isinstance(max_image_size, int):
             vision_encoder_max_size = max_image_size
@@ -873,7 +877,8 @@ def preprocess(
 
         if do_convert_rgb:
             images_list = [
-                [convert_to_rgb(image, palette, input_data_format=input_data_format) for image in images] for images, palette in zip(images_list, palettes)
+                [convert_to_rgb(image, palette, input_data_format=input_data_format) for image in images]
+                for images, palette in zip(images_list, palettes)
             ]
 
         if is_scaled_image(images_list[0][0]) and do_rescale:
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 8f40d10b3b4fe1..f84499d4262dbd 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -219,22 +219,6 @@ def __call__(
         """
         if text is None and images is None:
             raise ValueError("You must provide either `text` or `images`.")
-        # check if images and text inputs are reversed for BC
-        if (
-            text is not None
-            and not isinstance(text[0], str)
-            or images is not None
-            and not (
-                is_image_or_image_url(images)
-                or is_image_or_image_url(images[0])
-                or (isinstance(images[0], list) and is_image_or_image_url(images[0][0]))
-            )
-        ):
-            warnings.warn(
-                "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second."
-                "Images and text inputs will be swapped."
-            )
-            images, text = text, images
 
         output_kwargs = self._merge_kwargs(
             Idefics3ProcessorKwargs,
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index aef9a214928b62..8ff31b581dde0c 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 from io import BytesIO
+import unittest
 
 import requests
 
@@ -30,7 +31,7 @@
 
 @require_torch
 @require_vision
-class Idefics3ProcessorTest(ProcessorTesterMixin):
+class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def setUp(self):
         self.processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
         self.image1 = Image.open(

From 0533eda76b2e16620a8d0e95bf8143850c766f90 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 13 Aug 2024 15:23:54 +0000
Subject: [PATCH 22/50] fix modeling tests

---
 .../idefics3/image_processing_idefics3.py     |   9 +-
 .../models/idefics3/modeling_idefics3.py      |  13 +-
 .../models/idefics3/processing_idefics3.py    |  52 ++-
 .../models/idefics3/test_modeling_idefics3.py |  54 ++-
 .../idefics3/test_processing_idefics3.py      | 356 ++++++++++++++----
 5 files changed, 369 insertions(+), 115 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index a7e61ce8e2f06d..ed6d2574b3912b 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -248,7 +248,6 @@ def make_pixel_mask(
     return mask
 
 
-# Copied from transformers.image_transforms.to_pil_image
 def to_pil_image(
     image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
     do_rescale: Optional[bool] = None,
@@ -589,7 +588,6 @@ def _pad_image(
         )
         return padded_image
 
-    # Copied from transformers.models.idefics2.image_processing_idefics2.pad
     def pad(
         self,
         images: List[np.ndarray],
@@ -643,7 +641,6 @@ def empty_image(size, input_data_format):
                 return np.zeros((n_channels, *size), dtype=np.uint8)
             elif input_data_format == ChannelDimension.LAST:
                 return np.zeros((*size, n_channels), dtype=np.uint8)
-            raise ValueError("Invalid channel dimension format.")
 
         padded_images_list = [
             [empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size)
@@ -685,6 +682,7 @@ def preprocess(
         return_row_col_info: bool = False,
         input_data_format: Optional[ChannelDimension] = None,
         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        crop_size: Optional[Dict[str, int]] = None,
     ):
         """
         Preprocess a batch of images.
@@ -740,7 +738,12 @@ def preprocess(
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            crop_size (`Dict[str, int]`, *optional*):
+                Unused by this method; accepted only for compatibility. A warning is logged when a value is passed.
         """
+        if crop_size is not None:
+            logger.warning("crop_size is not used in Idefics3ImageProcessor.preprocess.")
+
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
         resample = resample if resample is not None else self.resample
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 4c77f699c441e5..614ea208b853e7 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -88,10 +88,10 @@ class Idefics3BaseModelOutputWithPast(ModelOutput):
 
 
 @dataclass
-# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics3
 class Idefics3CausalLMOutputWithPast(ModelOutput):
     """
-    Base class for Idefics3 causal language model (or autoregressive) outputs.
+    Base class for Idefics3 causal language model (or autoregressive) outputs.
+
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
             Language modeling loss (for next-token prediction).
@@ -100,20 +100,24 @@ class Idefics3CausalLMOutputWithPast(ModelOutput):
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
             Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
             `past_key_values` input) to speed up sequential decoding.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
             sequence_length)`.
+
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
             Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
             sequence_length, hidden_size)`.
+
             image_hidden_states of the model produced by the vision encoder
     """
 
@@ -125,7 +129,6 @@ class Idefics3CausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
 
 
-# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionEmbeddings
 class Idefics3VisionEmbeddings(nn.Module):
     """
     This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
@@ -553,7 +556,6 @@ def forward(
 """
 
 
-# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer
 @add_start_docstrings(
     "The Idefics3 Vision Transformer Model outputting raw image embedding.",
     IDEFICS3_VISION_START_DOCSTRING,
@@ -670,6 +672,9 @@ def forward(self, hidden_states):
         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
         return self.weight * hidden_states.to(input_dtype)
 
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
 
 class Idefics3Connector(nn.Module):
     def __init__(self, config):
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index f84499d4262dbd..13e89d136c4f27 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -17,7 +17,6 @@
 """
 
 import sys
-import warnings
 from typing import TYPE_CHECKING, List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
@@ -46,7 +45,7 @@ def is_image_or_image_url(elem):
     return is_url(elem) or is_valid_image(elem)
 
 
-def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token):
+def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token):
     """Prompt with expanded image tokens for when the image is split into patches."""
     text_split_images = ""
     for n_h in range(image_rows):
@@ -58,31 +57,42 @@ def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around
 
     text_split_images += (
         f"\n{fake_token_around_image}"
-        + "<global-img>"
+        + f"{global_img_token}"
         + f"{image_token}" * image_seq_len
         + f"{fake_token_around_image}"
     )
     return text_split_images
 
 
-def _prompt_single_image(image_seq_len, fake_token_around_image, image_token):
+def _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token):
     """Prompt with expanded image tokens for a single image."""
     return (
-        f"{fake_token_around_image}" + "<global-img>" + f"{image_token}" * image_seq_len + f"{fake_token_around_image}"
+        f"{fake_token_around_image}"
+        + f"{global_img_token}"
+        + f"{image_token}" * image_seq_len
+        + f"{fake_token_around_image}"
     )
 
 
-def get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token):
+def get_image_prompt_string(
+    image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token
+):
     if image_rows == 0 and image_cols == 0:
         return _prompt_single_image(
-            image_seq_len, fake_token_around_image=fake_token_around_image, image_token=image_token
+            image_seq_len,
+            fake_token_around_image=fake_token_around_image,
+            image_token=image_token,
+            global_img_token=global_img_token,
         )
-    return _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token)
+    return _prompt_split_image(
+        image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token
+    )
 
 
 class Idefics3ImagesKwargs(ImagesKwargs, total=False):
     image_seq_len: Optional[int]
     return_row_col_info: Optional[bool]
+    max_image_size: Optional[dict[str, int]]
 
 
 class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
@@ -95,7 +105,6 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
             "is_split_into_words": False,
         },
         "images_kwargs": {
-            "image_seq_len": 169,
             "return_row_col_info": True,
         },
     }
@@ -113,6 +122,10 @@ class Idefics3Processor(ProcessorMixin):
             An instance of [`Idefics3ImageProcessor`]. The image processor is a required input.
         tokenizer (`PreTrainedTokenizerBase`, *optional*):
             An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
+        image_seq_len (`int`, *optional*, defaults to 169):
+            The length of the image sequence, i.e. the number of `<image>` tokens per image in the input.
+            This parameter is used to build the string from the input prompt and image tokens and should match the
+            value the model used. It is computed as: `int(((image_size // patch_size) ** 2) / (scale_factor**2))`.
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
     """
@@ -121,7 +134,7 @@ class Idefics3Processor(ProcessorMixin):
     image_processor_class = "Idefics3ImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
-    def __init__(self, image_processor, tokenizer=None, chat_template: str = None, **kwargs):
+    def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: str = None, **kwargs):
         if image_processor is None:
             raise ValueError("You need to specify an `image_processor`.")
         if tokenizer is None:
@@ -130,9 +143,16 @@ def __init__(self, image_processor, tokenizer=None, chat_template: str = None, *
         self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
         self.image_token = AddedToken("<image>", normalized=False, special=True)
         self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
+        self.global_img_token = "<global-img>"
+        self.global_img_token_id = tokenizer.convert_tokens_to_ids(self.global_img_token)
+        self.image_seq_len = image_seq_len
 
         tokens_to_add = {
-            "additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token]
+            "additional_special_tokens": [
+                self.fake_image_token,
+                self.image_token,
+                self.end_of_utterance_token,
+            ]
         }
         tokenizer.add_special_tokens(tokens_to_add)
 
@@ -168,7 +188,7 @@ def __call__(
         >>> from transformers import Idefics3Processor
         >>> from transformers.image_utils import load_image
 
-        >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
+        >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
         >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example
 
         >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
@@ -201,6 +221,7 @@ def __call__(
                 `<fake_token_around_image>` + `<row_x_col_y>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
             image_seq_len (`int`, *optional*):
                 The length of the image sequence. If not provided, the default value of 169 is used.
+                When set, it should equal `int(((image_size // patch_size) ** 2) / (scale_factor**2))`.
             padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
                 Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
             truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
@@ -226,10 +247,11 @@ def __call__(
             **kwargs,
         )
 
-        # Temporary fix for "paddding_side" in init_kwargs
+        # Temporary fix for "padding_side" in init_kwargs
         _ = output_kwargs["text_kwargs"].pop("padding_side", None)
 
         image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
+        image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
 
         n_images_in_text = []
         n_images_in_images = []
@@ -277,6 +299,7 @@ def __call__(
 
             fake_image_token = self.fake_image_token.content
             image_token = self.image_token.content
+            global_img_token = self.global_img_token
 
             prompt_strings = []
             for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
@@ -291,10 +314,13 @@ def __call__(
                         image_seq_len,
                         image_token=image_token,
                         fake_token_around_image=fake_image_token,
+                        global_img_token=global_img_token,
                     )
                     image_prompt_strings.append(image_prompt_string)
 
                 split_sample = sample.split(image_token)
+                if len(split_sample) == 0:
+                    raise ValueError("The image token should be present in the text.")
 
                 # Place in the image prompt strings where the image tokens are
                 sample = split_sample[0]
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index c032514f3836e9..43c01ba2f0aa30 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -51,12 +51,11 @@ def __init__(
         parent,
         is_training=True,
         batch_size=2,
+        scale_factor=2,
         num_images=2,
-        seq_length=10,
         vision_config={
-            "image_size": 12,
-            "patch_size": 12,
-            "num_channels": 3,
+            "image_size": 16,
+            "patch_size": 4,
             "hidden_size": 32,
             "num_hidden_layers": 2,
             "num_attention_heads": 4,
@@ -65,15 +64,6 @@ def __init__(
             "attention_dropout": 0.1,
             "initializer_range": 0.02,
         },
-        perceiver_config={
-            "hidden_act": "silu",
-            "resampler_n_latents": 2,
-            "resampler_depth": 2,
-            "resampler_n_heads": 2,
-            "num_key_value_heads": 1,
-            "resampler_head_dim": 12,
-            "attention_dropout": 0.0,
-        },
         text_config={
             "vocab_size": 100,
             "hidden_size": 64,
@@ -85,10 +75,10 @@ def __init__(
             "max_position_embeddings": 256,
             "initializer_range": 0.02,
             "rms_norm_eps": 1e-6,
-            "pad_token_id": 0,  # None in the original configuration_mistral, we set it to the unk_token_id
-            "bos_token_id": 1,
-            "eos_token_id": 2,
-            "image_token_id": 32_001,
+            "pad_token_id": 2,
+            "bos_token_id": 0,
+            "eos_token_id": 1,
+            "image_token_id": 57,
             "tie_word_embeddings": False,
             "rope_theta": 10000.0,
             "sliding_window": 32,
@@ -96,14 +86,17 @@ def __init__(
         },
         use_cache=False,
         tie_word_embeddings=False,
-        image_token_id=99,
+        image_token_id=57,
     ):
         self.parent = parent
         self.is_training = is_training
         self.batch_size = batch_size
         self.num_images = num_images
-        self.num_channels = 3
-        self.seq_length = seq_length
+        self.scale_factor = scale_factor
+        self.seq_length = (
+            int(((vision_config["image_size"] // vision_config["patch_size"]) ** 2) / (self.scale_factor**2))
+            * self.num_images
+        )
         self.use_cache = use_cache
         self.image_token_id = image_token_id
         self.tie_word_embeddings = tie_word_embeddings
@@ -114,7 +107,6 @@ def __init__(
         self.hidden_size = text_config["hidden_size"]
 
         self.vision_config = vision_config
-        self.perceiver_config = perceiver_config
         self.text_config = text_config
 
     def get_config(self):
@@ -123,9 +115,9 @@ def get_config(self):
             image_token_id=self.image_token_id,
             tie_word_embeddings=self.tie_word_embeddings,
             vision_config=self.vision_config,
-            perceiver_config=self.perceiver_config,
             text_config=self.text_config,
             vocab_size=self.vocab_size,
+            scale_factor=self.scale_factor,
         )
 
     def prepare_config_and_inputs(self):
@@ -133,7 +125,7 @@ def prepare_config_and_inputs(self):
             [
                 self.batch_size,
                 self.num_images,
-                self.vision_config["num_channels"],
+                3,  # Idefics3ImageProcessor always generates RGB pixel values
                 self.vision_config["image_size"],
                 self.vision_config["image_size"],
             ]
@@ -148,7 +140,7 @@ def prepare_config_and_inputs_for_common(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
 
         # For simplicity just set the last n tokens to the image token
-        n_image_tokens_per_batch = self.num_images * self.perceiver_config["resampler_n_latents"]
+        n_image_tokens_per_batch = self.seq_length
         input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
         attention_mask = input_ids.ne(1).to(torch_device)
         inputs_dict = {
@@ -227,7 +219,7 @@ def test_resize_tokens_embeddings(self):
             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
             inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
-            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            n_images = self.model_tester.num_images * self.model_tester.seq_length
             model.image_token_id = model_vocab_size - 15 - 1
             inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
 
@@ -311,7 +303,7 @@ def test_resize_embeddings_untied(self):
             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
             inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
-            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            n_images = self.model_tester.num_images * self.model_tester.seq_length
             model.image_token_id = model_vocab_size - 15 - 1
             inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
 
@@ -379,7 +371,7 @@ def test_resize_tokens_embeddings(self):
             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
             inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
-            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            n_images = self.model_tester.num_images * self.model_tester.seq_length
             model.model.image_token_id = model_vocab_size - 15 - 1
             inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
 
@@ -456,7 +448,7 @@ def test_resize_embeddings_untied(self):
             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
             inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
-            n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+            n_images = self.model_tester.num_images * self.model_tester.seq_length
             model.model.image_token_id = model_vocab_size - 15 - 1
             inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
 
@@ -467,7 +459,7 @@ def test_resize_embeddings_untied(self):
 @require_torch
 class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3-base")
+        self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
         self.image1 = Image.open(
             BytesIO(
                 requests.get(
@@ -493,7 +485,7 @@ def tearDown(self):
     @slow
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
-            "HuggingFaceM4/Idefics3-8B-Llama3-base",
+            "HuggingFaceM4/Idefics3-8B-Llama3",
             torch_dtype=torch.bfloat16,
             device_map="auto",
         )
@@ -517,7 +509,7 @@ def test_integration_test(self):
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(
-            "HuggingFaceM4/Idefics3-8B-Llama3-base", load_in_4bit=True, device_map="auto"
+            "HuggingFaceM4/Idefics3-8B-Llama3", load_in_4bit=True, device_map="auto"
         )
 
         # Create pixel inputs
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 8ff31b581dde0c..d50d2d4eb91ef8 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -13,12 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from io import BytesIO
+import shutil
+import tempfile
 import unittest
+from io import BytesIO
 
 import requests
 
 from transformers import Idefics3Processor
+from transformers.models.auto.processing_auto import AutoProcessor
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available
 
@@ -32,8 +35,13 @@
 @require_torch
 @require_vision
 class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = Idefics3Processor
+
     def setUp(self):
-        self.processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
+        self.tmpdirname = tempfile.mkdtemp()
+        processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
+        processor.save_pretrained(self.tmpdirname)
+        self.max_image_size = 364
         self.image1 = Image.open(
             BytesIO(
                 requests.get(
@@ -51,62 +59,100 @@ def setUp(self):
                 ).content
             )
         )
-        self.bos_token = self.processor.tokenizer.bos_token
-        self.image_token = self.processor.image_token.content
-        self.fake_image_token = self.processor.fake_image_token.content
+        self.bos_token = processor.tokenizer.bos_token
+        self.image_token = processor.image_token.content
+        self.fake_image_token = processor.fake_image_token.content
+        self.global_img_token = processor.global_img_token
+
+        self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token)
+        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token)
+        self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
+        self.global_img_token_id = processor.global_img_token_id
+        self.padding_token_id = processor.tokenizer.pad_token_id
+        self.image_seq_len = processor.image_seq_len
+
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    def get_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_splitted_image_expected_tokens(self, processor, image_rows, image_cols):
+        text_split_images = []
+        for n_h in range(image_rows):
+            for n_w in range(image_cols):
+                text_split_images += (
+                    [self.fake_image_token_id]
+                    + processor.tokenizer(f"<row_{n_h + 1}_col_{n_w + 1}>", add_special_tokens=False)["input_ids"]
+                    + [self.image_token_id] * self.image_seq_len
+                )
+            text_split_images += processor.tokenizer("\n", add_special_tokens=False)["input_ids"]
+        text_split_images = text_split_images[:-1]  # remove last newline
+        text_split_images += processor.tokenizer("\n\n", add_special_tokens=False)[
+            "input_ids"
+        ]  # add double newline, as it gets its own token
+        text_split_images += (
+            [self.fake_image_token_id]
+            + [self.global_img_token_id]
+            + [self.image_token_id] * self.image_seq_len
+            + [self.fake_image_token_id]
+        )
+        return text_split_images
 
-        self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token)
-        self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.fake_image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
-        self.image_seq_len = self.processor.image_seq_len
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
 
     def test_process_interleaved_images_prompts_no_image_splitting(self):
-        old_image_splitting = self.processor.image_processor.do_image_splitting
-
-        self.processor.image_processor.do_image_splitting = False
+        processor = self.get_processor()
+        processor.image_processor.do_image_splitting = False
 
         # Test that a single image is processed correctly
-        inputs = self.processor(images=self.image1)
-        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
+        inputs = processor(images=self.image1)
+        image1_expected_size = (1092, 1456)
+        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, *image1_expected_size))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, *image1_expected_size))
         # fmt: on
 
         # Test a single sample with image and text
         image_str = "<image>"
         text_str = "In this image, we see"
         text = image_str + text_str
-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)
 
         # fmt: off
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
-        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+        tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
+        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
-        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
+        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 1092, 1456))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 1092, 1456))
         # fmt: on
 
         # Test that batch is correctly processed
         image_str = "<image>"
         text_str_1 = "In this image, we see"
-        text_str_2 = "bla, bla"
+        text_str_2 = "In this image, we see"
 
         text = [
             image_str + text_str_1,
-            text_str_2 + image_str + image_str,
+            image_str + image_str + text_str_2,
         ]
         images = [[self.image1], [self.image2, self.image3]]
 
-        inputs = self.processor(text=text, images=images, padding=True)
+        inputs = processor(text=text, images=images, padding=True)
 
         # fmt: off
-        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
-        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
-        expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
-        expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
+        tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
+        image_tokens = [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
+        expected_input_ids_1 = [self.bos_token_id] + image_tokens + tokenized_sentence_1["input_ids"]
+        expected_input_ids_2 = [self.bos_token_id] + 2 * image_tokens + tokenized_sentence_2["input_ids"]
         # Pad the first input to match the second input
         pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
-        padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
+        padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1
 
         self.assertEqual(
             inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
@@ -115,36 +161,35 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
             inputs["attention_mask"],
             [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
         )
-        self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 767, 980))
-        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980))
+        self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 1456, 1456))
+        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 1456, 1456))
         # fmt: on
 
-        self.processor.image_processor.do_image_splitting = old_image_splitting
-
     def test_process_interleaved_images_prompts_image_splitting(self):
-        old_image_splitting = self.processor.image_processor.do_image_splitting
-
-        self.processor.image_processor.do_image_splitting = True
+        processor = self.get_processor()
+        processor.image_processor.do_image_splitting = True
 
         # Test that a single image is processed correctly
-        inputs = self.processor(images=self.image1)
-        self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
+        inputs = processor(images=self.image1)
+        self.assertEqual(inputs["pixel_values"].shape, (1, 13, 3, 364, 364))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 13, 364, 364))
         # fmt: on
+        self.maxDiff = None
 
         # Test a single sample with image and text
         image_str = "<image>"
         text_str = "In this image, we see"
         text = image_str + text_str
-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)
 
         # fmt: off
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
-        expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
-        self.assertEqual(inputs["input_ids"], expected_input_ids)
-        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
-        self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
+        tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
+        splitted_image1_tokens = self.get_splitted_image_expected_tokens(processor, 3, 4)
+        expected_input_ids_1 = [[self.bos_token_id] + splitted_image1_tokens + tokenized_sentence["input_ids"]]
+        self.assertEqual(inputs["input_ids"], expected_input_ids_1)
+        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids_1[0])])
+        self.assertEqual(inputs["pixel_values"].shape, (1, 13, 3, 364, 364))
+        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 13, 364, 364))
         # fmt: on
 
         # Test that batch is correctly processed
@@ -158,16 +203,20 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         ]
         images = [[self.image1], [self.image2, self.image3]]
 
-        inputs = self.processor(text=text, images=images, padding=True)
+        inputs = processor(text=text, images=images, padding=True)
 
         # fmt: off
-        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
-        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
-        expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
-        expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id]
+        tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
+
+        splitted_image1_tokens = self.get_splitted_image_expected_tokens(processor, 3, 4)
+        splitted_image2_tokens = self.get_splitted_image_expected_tokens(processor, 4, 4)
+        splitted_image3_tokens = self.get_splitted_image_expected_tokens(processor, 3, 4)
+        expected_input_ids_1 = [self.bos_token_id] + splitted_image1_tokens + tokenized_sentence_1["input_ids"]
+        expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + splitted_image2_tokens + splitted_image3_tokens
         # Pad the first input to match the second input
         pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
-        padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
+        padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1
 
         self.assertEqual(
             inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
@@ -176,27 +225,26 @@ def test_process_interleaved_images_prompts_image_splitting(self):
             inputs["attention_mask"],
             [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
         )
-        self.assertEqual(inputs['pixel_values'].shape, (2, 10, 3, 767, 980))
-        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980))
+        self.assertEqual(inputs['pixel_values'].shape, (2, 30, 3, 364, 364))
+        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 30, 364, 364))
         # fmt: on
 
-        self.processor.image_processor.do_image_splitting = old_image_splitting
-
     def test_add_special_tokens_processor(self):
+        processor = self.get_processor()
+
         image_str = "<image>"
         text_str = "In this image, we see"
         text = text_str + image_str
 
-        n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1
-
         # fmt: off
-        inputs = self.processor(text=text, images=self.image1, add_special_tokens=False)
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
-        expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
+        inputs = processor(text=text, images=self.image1, add_special_tokens=False)
+        tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
+        splitted_image1_tokens = self.get_splitted_image_expected_tokens(processor, 3, 4)
+        expected_input_ids = [tokenized_sentence["input_ids"] + splitted_image1_tokens]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
 
-        inputs = self.processor(text=text, images=self.image1)
-        expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
+        inputs = processor(text=text, images=self.image1)
+        expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + splitted_image1_tokens]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         # fmt: on
 
@@ -223,8 +271,7 @@ def test_apply_chat_template(self):
             },
             {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
         ]
-
-        processor = self.processor
+        processor = self.get_processor()
         # Make short sequence length to test that the fake tokens are added correctly
         rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
 
@@ -235,3 +282,184 @@ def test_apply_chat_template(self):
             "Assistant:"
         )
         self.assertEqual(rendered, expected_rendered)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer <image>"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 364)  # crop size doesn't affect our image processor
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", max_image_size={"longest_edge": 80})
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer <image>"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 80)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_vision
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=30)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer<image>"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
+        self.assertEqual(len(inputs["input_ids"][0]), 30)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer<image>"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"max_image_size": {"longest_edge": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 120},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 120)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer<image>"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"max_image_size": {"longest_edge": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 120},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 120)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=30)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer<image>"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(len(inputs["input_ids"][0]), 30)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["<image>lower newer", "<image>upper older longer string"]
+        image_input = self.prepare_image_inputs()
+        print(image_input)
+        inputs = processor(
+            text=input_str,
+            images=[image_input, image_input],
+            return_tensors="pt",
+            padding="longest",
+            max_length=76,
+            max_image_size={"longest_edge": 214},
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 3)
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 88)
+
+    # We need to overwrite this test to adapt it to our processor.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer<image>"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            max_image_size={"longest_edge": 214},
+            padding="max_length",
+            max_length=120,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 120)

From b03409159034191b71360890ab2334e6394b64e8 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Thu, 15 Aug 2024 10:08:47 +0000
Subject: [PATCH 23/50] remove special tokens that are not special

---
 .../models/idefics3/processing_idefics3.py            | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 13e89d136c4f27..2472a4543601ae 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -18,6 +18,7 @@
 
 import sys
 from typing import TYPE_CHECKING, List, Optional, Union
+import re
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, is_valid_image, load_image
@@ -144,9 +145,10 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
         self.image_token = AddedToken("<image>", normalized=False, special=True)
         self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
         self.global_img_token = "<global-img>"
-        self.global_img_token_id = tokenizer.convert_tokens_to_ids(self.global_img_token)
         self.image_seq_len = image_seq_len
 
+        self._regex_to_remove_extra_special_tokens = re.compile(r'(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+')
+
         tokens_to_add = {
             "additional_special_tokens": [
                 self.fake_image_token,
@@ -343,14 +345,17 @@ def batch_decode(self, *args, **kwargs):
         This method forwards all its arguments to Idefics3TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
         refer to the docstring of this method for more information.
         """
-        return self.tokenizer.batch_decode(*args, **kwargs)
+        batched_decode_output = self.tokenizer.batch_decode(*args, **kwargs)
+        return [self._regex_to_remove_extra_special_tokens.sub("<image>", s) for s in batched_decode_output]
 
     def decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to Idefics3TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
         the docstring of this method for more information.
         """
-        return self.tokenizer.decode(*args, **kwargs)
+        decode_output = self.tokenizer.decode(*args, **kwargs)
+        return self._regex_to_remove_extra_special_tokens.sub("<image>", decode_output)
+
 
     @property
     def model_input_names(self):

From 47fb7ceda6c203aed7589c6ffde85f86f8ac50f8 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Thu, 15 Aug 2024 10:10:18 +0000
Subject: [PATCH 24/50] fixes tests

---
 docs/source/en/model_doc/idefics3.md          | 16 ++++
 .../models/idefics3/configuration_idefics3.py |  4 -
 .../idefics3/image_processing_idefics3.py     |  2 -
 .../models/idefics3/processing_idefics3.py    | 14 +--
 src/transformers/processing_utils.py          | 11 +++
 src/transformers/utils/dummy_pt_objects.py    | 17 +++-
 .../models/idefics3/test_modeling_idefics3.py | 23 +++--
 .../idefics3/test_processing_idefics3.py      | 91 ++++++++++---------
 utils/check_repo.py                           |  1 +
 9 files changed, 110 insertions(+), 69 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 9391c9020ee502..17d2a965df9d96 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -40,6 +40,22 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
 The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
 
 
+## Idefics3Config
+
+[[autodoc]] Idefics3Config
+
+
+## Idefics3Model
+
+[[autodoc]] Idefics3Model
+    - forward
+
+## Idefics3ForConditionalGeneration
+
+[[autodoc]] Idefics3ForConditionalGeneration
+    - forward
+
+
 ## Idefics3ImageProcessor
 [[autodoc]] Idefics3ImageProcessor
     - preprocess
diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 4b03a3af1576fb..75e7c1da261023 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -152,8 +152,6 @@ class Idefics3Config(PretrainedConfig):
             The scale factor for the image encoder.
         pad_token_id (`int`, *optional*, defaults to 128002):
             The id of the padding token.
-        max_position_embeddings (`int`, *optional*, defaults to 131072):
-            The maximum length of the input sequence.
 
     Example:
     ```python
@@ -178,7 +176,6 @@ def __init__(
         text_config=None,
         scale_factor=2,
         pad_token_id=128_002,
-        max_position_embeddings=131_072,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -199,7 +196,6 @@ def __init__(
         elif text_config is None:
             logger.info("text_config is None, using default text config")
             text_config = CONFIG_MAPPING["llama"](
-                max_position_embeddings=max_position_embeddings,
                 rms_norm_eps=1e-5,
                 pad_token_id=pad_token_id,
                 tie_word_embeddings=False,
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index ed6d2574b3912b..6681aa491405cc 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -385,8 +385,6 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         do_pad (`bool`, *optional*, defaults to `True`):
             Whether or not to pad the images to the largest height and width in the batch and number of images per
             sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
-        vision_encoder_max_size (`int`, *optional*, defaults to `364`):
-            Maximum size of the images accepted by the vision encoder. The images are split into patches of this size.
     """
 
     model_input_names = ["pixel_values"]
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 2472a4543601ae..6201bf3850fea6 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -16,9 +16,9 @@
 Processor class for Idefics3.
 """
 
-import sys
-from typing import TYPE_CHECKING, List, Optional, Union
 import re
+import sys
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, is_valid_image, load_image
@@ -93,7 +93,7 @@ def get_image_prompt_string(
 class Idefics3ImagesKwargs(ImagesKwargs, total=False):
     image_seq_len: Optional[int]
     return_row_col_info: Optional[bool]
-    max_image_size: Optional[dict[str, int]]
+    max_image_size: Optional[Dict[str, int]]
 
 
 class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
@@ -111,6 +111,9 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
     }
 
 
+Idefics3ProcessorKwargs.__annotations__["images_kwargs"] = Idefics3ImagesKwargs  # python 3.8 compatibility
+
+
 class Idefics3Processor(ProcessorMixin):
     r"""
     Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor.
@@ -147,7 +150,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
         self.global_img_token = "<global-img>"
         self.image_seq_len = image_seq_len
 
-        self._regex_to_remove_extra_special_tokens = re.compile(r'(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+')
+        self._regex_to_remove_extra_special_tokens = re.compile(r"(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+")
 
         tokens_to_add = {
             "additional_special_tokens": [
@@ -158,7 +161,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
         }
         tokenizer.add_special_tokens(tokens_to_add)
 
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
 
     def _extract_images_from_prompts(self, prompts):
         prompt_images = []
@@ -356,7 +359,6 @@ def decode(self, *args, **kwargs):
         decode_output = self.tokenizer.decode(*args, **kwargs)
         return self._regex_to_remove_extra_special_tokens.sub("<image>", decode_output)
 
-
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 8b33c456924ded..39ab38457def9d 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -291,6 +291,17 @@ class ModelProcessorKwargs(ProcessingKwargs, total=False):
         }
 
     ```
+
+    For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs,
+    you need to manually update the __annotations__ dictionary. This can be done as follows:
+
+    ```python
+    class CustomProcessorKwargs(ProcessingKwargs, total=False):
+        images_kwargs: CustomImagesKwargs
+
+    CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs  # python 3.8 compatibility
+    ```
+
     """
 
     common_kwargs: CommonKwargs = {
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 52e3c33629fbdb..349abe7998b3f0 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4828,35 +4828,42 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics3ForConditionalGeneration(metaclass=DummyObject):
+class Idefics2Model(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics2Model(metaclass=DummyObject):
+class Idefics2PreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics2PreTrainedModel(metaclass=DummyObject):
+class Idefics2Processor(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics3PreTrainedModel(metaclass=DummyObject):
+class Idefics3ForConditionalGeneration(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics2Processor(metaclass=DummyObject):
+class Idefics3Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Idefics3PreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index 43c01ba2f0aa30..d8945db7fa7ce6 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -23,13 +23,10 @@
 
 from transformers import (
     AutoProcessor,
-    Idefics3Config,
-    Idefics3ForConditionalGeneration,
-    Idefics3Model,
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_multi_gpu, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -38,6 +35,12 @@
 
 if is_torch_available():
     import torch
+
+    from transformers import (
+        Idefics3Config,
+        Idefics3ForConditionalGeneration,
+        Idefics3Model,
+    )
 else:
     is_torch_greater_or_equal_than_2_0 = False
 
@@ -483,13 +486,13 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     @slow
+    @require_torch_multi_gpu
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
             "HuggingFaceM4/Idefics3-8B-Llama3",
             torch_dtype=torch.bfloat16,
             device_map="auto",
         )
-        model.to(torch_device)
 
         # Create inputs
         text = "<image>In this image, we see"
@@ -500,16 +503,18 @@ def test_integration_test(self):
         generated_ids = model.generate(**inputs, max_new_tokens=10)
         generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
 
-        # Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and']
-        expected_generated_text = "In this image, we see the Statue of Liberty, the New York City"
+        expected_generated_text = "<image>In this image, we see the Statue of Liberty, which is located on Liberty"
         self.assertEqual(generated_texts[0], expected_generated_text)
 
     @slow
     @require_bitsandbytes
+    @require_torch_multi_gpu
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(
-            "HuggingFaceM4/Idefics3-8B-Llama3", load_in_4bit=True, device_map="auto"
+            "HuggingFaceM4/Idefics3-8B-Llama3",
+            load_in_4bit=True,
+            device_map="auto",
         )
 
         # Create pixel inputs
@@ -520,5 +525,5 @@ def test_integration_test_4bit(self):
         generated_ids = model.generate(**inputs, max_new_tokens=10)
         generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
 
-        expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
+        expected_generated_text = "<image>In this image, we see the Statue of Liberty, trees, buildings, water"
         self.assertEqual(generated_texts[0], expected_generated_text)
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index d50d2d4eb91ef8..f4396b7c13d540 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -37,39 +37,39 @@
 class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = Idefics3Processor
 
-    def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
+    @classmethod
+    def setUpClass(cls):
+        cls.tmpdirname = tempfile.mkdtemp()
         processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
-        processor.save_pretrained(self.tmpdirname)
-        self.max_image_size = 364
-        self.image1 = Image.open(
+        processor.save_pretrained(cls.tmpdirname)
+        cls.image1 = Image.open(
             BytesIO(
                 requests.get(
                     "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                 ).content
             )
         )
-        self.image2 = Image.open(
+        cls.image2 = Image.open(
             BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
         )
-        self.image3 = Image.open(
+        cls.image3 = Image.open(
             BytesIO(
                 requests.get(
                     "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
                 ).content
             )
         )
-        self.bos_token = processor.tokenizer.bos_token
-        self.image_token = processor.image_token.content
-        self.fake_image_token = processor.fake_image_token.content
-        self.global_img_token = processor.global_img_token
-
-        self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token)
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
-        self.global_img_token_id = processor.global_img_token_id
-        self.padding_token_id = processor.tokenizer.pad_token_id
-        self.image_seq_len = processor.image_seq_len
+        cls.bos_token = processor.tokenizer.bos_token
+        cls.image_token = processor.image_token.content
+        cls.fake_image_token = processor.fake_image_token.content
+        cls.global_img_token = processor.global_img_token
+
+        cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
+        cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
+        cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
+        cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
+        cls.padding_token_id = processor.tokenizer.pad_token_id
+        cls.image_seq_len = processor.image_seq_len
 
     def get_tokenizer(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -96,14 +96,15 @@ def get_splitted_image_expected_tokens(self, processor, image_rows, image_cols):
         ]  # add double newline, as it gets its own token
         text_split_images += (
             [self.fake_image_token_id]
-            + [self.global_img_token_id]
+            + self.global_img_tokens_id
             + [self.image_token_id] * self.image_seq_len
             + [self.fake_image_token_id]
         )
         return text_split_images
 
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tmpdirname)
 
     def test_process_interleaved_images_prompts_no_image_splitting(self):
         processor = self.get_processor()
@@ -124,7 +125,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
 
         # fmt: off
         tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
-        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
         self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 1092, 1456))
@@ -147,7 +148,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         # fmt: off
         tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
         tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
-        image_tokens = [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
+        image_tokens = [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
         expected_input_ids_1 = [self.bos_token_id] + image_tokens + tokenized_sentence_1["input_ids"]
         expected_input_ids_2 = [self.bos_token_id] + 2 * image_tokens + tokenized_sentence_2["input_ids"]
         # Pad the first input to match the second input
@@ -308,8 +309,10 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
     def test_kwargs_overrides_default_image_processor_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", max_image_size={"longest_edge": 80})
-        tokenizer = self.get_component("tokenizer", max_length=117)
+        image_processor = self.get_component(
+            "image_processor", max_image_size={"longest_edge": 32}, size={"longest_edge": 32}
+        )
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
 
         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
@@ -319,7 +322,8 @@ def test_kwargs_overrides_default_image_processor_kwargs(self):
 
         inputs = processor(text=input_str, images=image_input)
         self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
-        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 80)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 32)
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
 
     # We need to overwrite this test to adapt it to our processor.
     @require_vision
@@ -354,16 +358,16 @@ def test_structured_kwargs_nested(self):
         image_input = self.prepare_image_inputs()
 
         # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"max_image_size": {"longest_edge": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 120},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            common_kwargs={"return_tensors": "pt"},
+            images_kwargs={"max_image_size": {"longest_edge": 32}},
+            text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
+        )
         self.skip_processor_without_typed_kwargs(processor)
 
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(inputs["pixel_values"].shape[3], 32)
 
         self.assertEqual(len(inputs["input_ids"][0]), 120)
 
@@ -385,12 +389,12 @@ def test_structured_kwargs_nested_from_dict(self):
         # Define the kwargs for each modality
         all_kwargs = {
             "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"max_image_size": {"longest_edge": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 120},
+            "images_kwargs": {"max_image_size": {"longest_edge": 32}},
+            "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
         }
 
         inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
 
     # We need to overwrite this test to adapt it to our processor.
@@ -424,19 +428,19 @@ def test_unstructured_kwargs_batched(self):
 
         input_str = ["<image>lower newer", "<image>upper older longer string"]
         image_input = self.prepare_image_inputs()
-        print(image_input)
         inputs = processor(
             text=input_str,
             images=[image_input, image_input],
             return_tensors="pt",
             padding="longest",
             max_length=76,
-            max_image_size={"longest_edge": 214},
+            truncation=True,
+            max_image_size={"longest_edge": 30},
         )
 
         self.assertEqual(inputs["pixel_values"].shape[2], 3)
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 88)
+        self.assertEqual(inputs["pixel_values"].shape[3], 30)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
 
     # We need to overwrite this test to adapt it to our processor.
     @require_torch
@@ -456,10 +460,11 @@ def test_unstructured_kwargs(self):
             text=input_str,
             images=image_input,
             return_tensors="pt",
-            max_image_size={"longest_edge": 214},
+            max_image_size={"longest_edge": 32},
             padding="max_length",
             max_length=120,
+            truncation="longest_first",
         )
 
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 2f0e12c9cf51be..a78cfa33fdf30a 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -82,6 +82,7 @@
     "SeamlessM4Tv2TextToUnitModel",
     "SeamlessM4Tv2CodeHifiGan",
     "SeamlessM4Tv2TextToUnitForConditionalGeneration",
+    "Idefics3VisionTransformer",
 ]
 
 # Update this list for models that are not tested with a comment explaining the reason it should not be.

From 4032a6f45730292e66ba3de6926958613e940597 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Wed, 21 Aug 2024 13:22:20 +0000
Subject: [PATCH 25/50] skip failing tests - they also fail for idefics2

---
 tests/models/idefics3/test_modeling_idefics3.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index d8945db7fa7ce6..0a0b22df963f52 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -26,7 +26,7 @@
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_multi_gpu, slow, torch_device
+from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -486,7 +486,7 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     @slow
-    @require_torch_multi_gpu
+    @unittest.skip("Test hits OOM on CI - https://github.com/huggingface/transformers/issues/32288")
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
             "HuggingFaceM4/Idefics3-8B-Llama3",
@@ -508,7 +508,7 @@ def test_integration_test(self):
 
     @slow
     @require_bitsandbytes
-    @require_torch_multi_gpu
+    @unittest.skip("Test hits OOM on CI - https://github.com/huggingface/transformers/issues/32288")
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(

From 757e834a06dae7eceb99f023d80e7185ac093abd Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 27 Aug 2024 10:13:13 +0000
Subject: [PATCH 26/50] added paper and readded the tests with multi gpu, who
 knows

---
 docs/source/en/model_doc/idefics3.md            | 12 ++++++------
 tests/models/idefics3/test_modeling_idefics3.py |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 17d2a965df9d96..d34f27bcc26e31 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -18,23 +18,23 @@ rendered properly in your Markdown viewer.
 
 ## Overview
 
-The Idefics3 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+The Idefics3 model was proposed in [Building and better understanding vision-language models: insights and future directions](https://huggingface.co/papers/2408.12637) by Hugo Laurençon, Andrés Marafioti, Victor Sanh, and Léo Tronchon.
 
-Idefics3 is an adaptation of the Idefics2 model with three main differences: 
-- the use of Llama3 for the text model
+Idefics3 is an adaptation of the Idefics2 model with three main differences:
+- the use of Llama3 for the text model.
 - an updated processing logic for the images.
-- The removal of the perceiver. 
+- The removal of the perceiver.
 
 The resolutions of input images can be directly controlled, and they are decomposed into
 patches, or not, depending on the resolution. See [Idefics2] for more details on the model architecture.
 
 The abstract from the paper is the following:
 
-*<INSERT PAPER ABSTRACT HERE>*
+The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.
 
 Tips:
 
-<INSERT TIPS ABOUT MODEL HERE>
+The input given to the model will be resized by default such that the longest side is 4*364. For faster inference, set do_resize to False.
 
 This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
 The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index 0a0b22df963f52..d8945db7fa7ce6 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -26,7 +26,7 @@
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_multi_gpu, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -486,7 +486,7 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     @slow
-    @unittest.skip("Test hits OOM on CI - https://github.com/huggingface/transformers/issues/32288")
+    @require_torch_multi_gpu
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
             "HuggingFaceM4/Idefics3-8B-Llama3",
@@ -508,7 +508,7 @@ def test_integration_test(self):
 
     @slow
     @require_bitsandbytes
-    @unittest.skip("Test hits OOM on CI - https://github.com/huggingface/transformers/issues/32288")
+    @require_torch_multi_gpu
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(

From 779727906597a7d8a94c000b2921d304729b244f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Fri, 30 Aug 2024 13:44:19 +0200
Subject: [PATCH 27/50] Update docs/source/en/model_doc/idefics3.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 docs/source/en/model_doc/idefics3.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index d34f27bcc26e31..0a0d9289aa3caa 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -34,7 +34,7 @@ The field of vision-language models (VLMs), which take images and texts as input
 
 Tips:
 
-The input given to the model will be resized by default such that the longest side is 4*364. For faster inference, set do_resize to False.
+- The input given to the model will be resized by default such that the longest side is 4*364. For faster inference, set `do_resize` to `False`.
 
 This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
 The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).

From b47812439c26c8dbd18ee482740b1b94bab4775a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Fri, 30 Aug 2024 13:52:53 +0200
Subject: [PATCH 28/50] Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 docs/source/en/model_doc/idefics3.md             |  2 +-
 .../models/idefics3/configuration_idefics3.py    |  6 +++---
 .../models/idefics3/image_processing_idefics3.py | 16 ++++++----------
 .../models/idefics3/modeling_idefics3.py         |  8 +++-----
 .../models/idefics3/processing_idefics3.py       | 14 ++------------
 5 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 0a0d9289aa3caa..1f43d35578e1c6 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -30,7 +30,7 @@ patches, or not, depending on the resolution. See [Idefics2] for more details on
 
 The abstract from the paper is the following:
 
-The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.
+*The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.*
 
 Tips:
 
diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 75e7c1da261023..b1347851c884ec 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -144,9 +144,9 @@ class Idefics3Config(PretrainedConfig):
             The id of the "image" token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether or not to tie the word embeddings with the token embeddings.
-        vision_config (`IdeficsVisionConfig` or `dict`, *optional*):
-            Custom vision config or dict
-        text_config (`LlamaConfig` or `dict`, *optional*):
+        vision_config (`IdeficsVisionConfig` or `dict`, *optional*, defaults to `IdeficsVisionConfig`):
+            Custom vision config or dict for the vision tower
+        text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
             Custom text config or dict for the text model
         scale_factor (`int`, *optional*, defaults to 2):
             The scale factor for the image encoder.
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 6681aa491405cc..b67baa1df6706d 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -220,12 +220,12 @@ def get_max_height_width(
     if input_data_format is None:
         input_data_format = infer_channel_dimension_format(images_list[0][0])
 
-    image_sizes = []
+    max_height = max_width = float("-inf")
     for images in images_list:
         for image in images:
-            image_sizes.append(get_image_size(image, channel_dim=input_data_format))
-
-    max_height, max_width = max_across_indices(image_sizes)
+            height, width = get_image_size(image, channel_dim=input_data_format)
+            max_height = max(height, max_height)
+            max_width = max(width, max_width)
     return (max_height, max_width)
 
 
@@ -755,11 +755,6 @@ def preprocess(
         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
         do_pad = do_pad if do_pad is not None else self.do_pad
 
-        if not do_image_splitting:
-            logger.warning_once(
-                "Idefics3 was trained on splitted image to support high resolution. Setting do_image_splitting=False will degrade the performance."
-            )
-
         images_list = make_list_of_images(images)
 
         if not valid_images(images_list[0]):
@@ -768,10 +763,11 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
+        # save the palettes for conversion to RGB
         palettes = [
             image[0].getpalette() if isinstance(image[0], Image.Image) and image[0].mode == "P" else None
             for image in images_list
-        ]  # save the palettes for conversion to RGB
+        ]
         # All transformations expect numpy arrays.
         images_list = [[to_numpy_array(image) for image in images] for images in images_list]
 
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 614ea208b853e7..83d371711c5e9a 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -391,8 +391,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 class Idefics3SimpleMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.config = config
-
         input_size = config.vision_config.hidden_size * (config.scale_factor**2)
         output_size = config.text_config.hidden_size
         self.proj = nn.Linear(input_size, output_size, bias=False)
@@ -894,9 +892,9 @@ def inputs_merger(
         num_images, _, vision_hidden_size = image_hidden_states.shape
         special_image_token_mask = input_ids == self.image_token_id
         new_inputs_embeds = inputs_embeds.clone()
-        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size).to(
-            inputs_embeds.dtype
-        )  # cast to the dtype of the input_embeds to support quantized models
+        reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
+        # cast to the dtype of the input_embeds to support quantized models
+        reshaped_image_hidden_states = reshaped_image_hidden_states.to(inputs_embeds.dtype)
         new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
         return new_inputs_embeds
 
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 6201bf3850fea6..2f815044d6a831 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -253,7 +253,7 @@ def __call__(
         )
 
         # Temporary fix for "padding_side" in init_kwargs
-        _ = output_kwargs["text_kwargs"].pop("padding_side", None)
+        output_kwargs["text_kwargs"].pop("padding_side", None)
 
         image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
         image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
@@ -278,17 +278,7 @@ def __call__(
             n_images_in_images = [len(sample) for sample in images]
 
             # Load images if they are URLs
-            new_images = []
-            for sample in images:
-                new_images.append([])
-                for im in sample:
-                    if is_valid_image(im):
-                        new_images[-1].append(im)  # already loaded
-                    elif isinstance(im, str):
-                        new_images[-1].append(load_image(im))
-
-            images = new_images
-            del new_images
+            images = [[load_image(im) if is_url(im) else im for im in sample] for sample in images]
 
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
             inputs.update(image_inputs)

From ada6219101a18359d230dd5125dcaf1c35eb7cc1 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 30 Aug 2024 15:55:51 +0000
Subject: [PATCH 29/50] review amy until image_processing_idefics3

---
 docs/source/en/model_doc/idefics3.md          |  14 +-
 .../models/idefics3/configuration_idefics3.py |   4 +-
 .../idefics3/image_processing_idefics3.py     | 188 +++++++++---------
 .../models/idefics3/test_modeling_idefics3.py |   6 +-
 .../idefics3/test_processing_idefics3.py      |  27 +--
 5 files changed, 125 insertions(+), 114 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 1f43d35578e1c6..4c3ae0d1c0146b 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -21,12 +21,13 @@ rendered properly in your Markdown viewer.
 The Idefics3 model was proposed in [Building and better understanding vision-language models: insights and future directions](https://huggingface.co/papers/2408.12637) by Hugo Laurençon, Andrés Marafioti, Victor Sanh, and Léo Tronchon.
 
 Idefics3 is an adaptation of the Idefics2 model with three main differences:
-- the use of Llama3 for the text model.
-- an updated processing logic for the images.
-- The removal of the perceiver.
 
-The resolutions of input images can be directly controlled, and they are decomposed into
-patches, or not, depending on the resolution. See [Idefics2] for more details on the model architecture.
+- It uses Llama3 for the text model.
+- It uses an updated processing logic for the images.
+- It removes the perceiver.
+
+Input images are either upsampled such that the longest side is 4*364 (if `do_resize` is set to `True`) or processed in their original resolution.
+In any case, the image processor decomposes images into patches of 364x364 pixels.
 
 The abstract from the paper is the following:
 
@@ -34,10 +35,9 @@ The abstract from the paper is the following:
 
 Tips:
 
-- The input given to the model will be resized by default such that the longest side is 4*364. For faster inference, set `do_resize` to `False`.
+- By default, the input given to the model will be resized such that the longest side is 4*364. For faster inference, set `do_resize` to `False`.
 
 This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
-The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
 
 
 ## Idefics3Config
diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index b1347851c884ec..45afe685f5209c 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -36,13 +36,13 @@ class Idefics3VisionConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        hidden_size (`int`, *optional*, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 1152):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
         num_channels (`int`, *optional*, defaults to 3):
             Number of channels in the input images.
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index b67baa1df6706d..31c2ee71c69db2 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -158,10 +158,7 @@ def get_resize_output_image_size(
     if resolution_max_side > max_image_size:
         raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
 
-    if isinstance(image, Image.Image):
-        width, height = image.size
-    else:
-        height, width = get_image_size(image, channel_dim=input_data_format)
+    height, width = get_image_size(image, channel_dim=input_data_format)
 
     # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
     height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=resolution_max_side)
@@ -248,6 +245,7 @@ def make_pixel_mask(
     return mask
 
 
+# Custom to_pil_image function to support image_mode
 def to_pil_image(
     image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
     do_rescale: Optional[bool] = None,
@@ -308,6 +306,7 @@ def convert_to_rgb(
     """
     if not isinstance(image, PIL.Image.Image):
         mode = "P" if palette is not None else None
+        # Custom to_pil_image function to support image_mode
         image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format)
         if image.mode == "P" and palette is not None:
             image.putpalette(palette)
@@ -355,18 +354,18 @@ class Idefics3ImageProcessor(BaseImageProcessor):
             Only has an effect if the input image is in the PIL format.
         do_resize (`bool`, *optional*, defaults to `True`):
             Whether to resize the image. The longest edge of the image is resized to  be <= `size["longest_edge"]`, with the
-            shortest edge resized to keep the input aspect ratio, with a minimum size of `size["shortest_edge"]`.
-        size (`Dict`, *optional*):
-            Controls the size of the output image. This is a dictionary containing the keys "shortest_edge" and "longest_edge".
+            shortest edge resized to keep the input aspect ratio.
+        size (`Dict`, *optional*, defaults to `{"longest_edge": 4 * 364}`):
+            Controls the size of the output image. This is a dictionary containing the key "longest_edge".
             The image will be resized such that the longest edge is <= `size["longest_edge"]` and the shortest edge is resized
-            to keep the input aspect ratio, with a lower bound of `size["shortest_edge"]`.
+            to keep the input aspect ratio.
         resample (`Resampling`, *optional*, defaults to `Resampling.LANCZOS`):
             Resampling filter to use when resizing the image.
         do_image_splitting (`bool`, *optional*, defaults to `True`):
             Whether to split the image into sub-images concatenated with the original image. They are split into patches
             such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
-        max_image_size (`Dict`, *optional*, defaults to `self.max_image_size`):
-            Maximum resolution of the images accepted by the model. This is a dictionary containing the key "longest".
+        max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
+            Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
         do_rescale (`bool`, *optional*, defaults to `True`):
             Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1.
         rescale_factor (`float`, *optional*, defaults to `1/255`):
@@ -437,7 +436,7 @@ def resize(
                 Image to resize.
             size (`Dict[str, int]`):
                 Size of the output image.
-            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
@@ -458,6 +457,7 @@ def resize(
             image_mode = None
             if image.ndim == 2 or image.shape[-1] == 1:
                 image_mode = "P"
+            # Custom to_pil_image function to support image_mode
             image = to_pil_image(image, input_data_format=input_data_format, image_mode=image_mode)
 
         resized_image = image.resize((size[1], size[0]), resample=resample)
@@ -496,10 +496,7 @@ def split_image(
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
         """
-        if isinstance(image, Image.Image):
-            width, height = image.size
-        else:
-            height, width = get_image_size(image, channel_dim=input_data_format)
+        height, width = get_image_size(image, channel_dim=input_data_format)
         max_height = max_width = max_image_size["longest_edge"]
 
         frames = []
@@ -523,42 +520,70 @@ def split_image(
                     end_y = min(start_y + optimal_height, height)
 
                     # Crop the image
-                    if isinstance(image, Image.Image):
-                        cropped_image = image.crop((start_x, start_y, end_x, end_y))
-                    else:
-                        cropped_image = _crop(
-                            image,
-                            start_x,
-                            start_y,
-                            end_x,
-                            end_y,
-                            input_data_format=input_data_format,
-                            data_format=data_format,
-                        )
+                    cropped_image = _crop(
+                        image,
+                        start_x,
+                        start_y,
+                        end_x,
+                        end_y,
+                        input_data_format=input_data_format,
+                        data_format=data_format,
+                    )
                     frames.append(cropped_image)
 
             # For the global image at the end, we resize it to match the max_image_size, for cpu memory efficiency
             global_image_height, global_image_width = max_height, max_width
             if height != global_image_height or width != global_image_width:
-                if isinstance(image, Image.Image):
-                    image = image.resize((global_image_width, global_image_height), resample=resample)
-                else:
-                    image = self.resize(
-                        image,
-                        {"height": global_image_height, "width": global_image_width},
-                        resample=resample,
-                        input_data_format=input_data_format,
-                    )
+                image = self.resize(
+                    image,
+                    {"height": global_image_height, "width": global_image_width},
+                    resample=resample,
+                    input_data_format=input_data_format,
+                )
         else:
             num_splits_h, num_splits_w = 0, 0
 
-        if data_format is not None and not isinstance(image, Image.Image):
+        if data_format is not None:
             image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
 
         frames.append(image)
 
         return frames, num_splits_h, num_splits_w
 
+    def resize_for_vision_encoder(
+        self,
+        image: np.ndarray,
+        vision_encoder_max_size: int,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Resize images to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Images to resize.
+            vision_encoder_max_size (`int`):
+                Maximum size of the output image. If the image is larger than this size, it will be split into
+                patches of this size, and the original image will be concatenated with the patches, resized to max_size.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+                Resampling filter to use when resizing the image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        height, width, _ = image.shape
+        aspect_ratio = width / height
+        if width >= height:
+            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            height = int(width / aspect_ratio)
+            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+        elif height > width:
+            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            width = int(height * aspect_ratio)
+            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+        new_size = {"height": height, "width": width}
+        return self.resize(image, size=new_size, resample=resample, input_data_format=input_data_format)
+
     def _pad_image(
         self,
         image: np.ndarray,
@@ -764,13 +789,20 @@ def preprocess(
             )
 
         # save the palettes for conversion to RGB
-        palettes = [
-            image[0].getpalette() if isinstance(image[0], Image.Image) and image[0].mode == "P" else None
-            for image in images_list
+        palettes_list = [
+            [im.getpalette() if isinstance(im, Image.Image) and im.mode == "P" else None for im in images]
+            for images in images_list
         ]
+
         # All transformations expect numpy arrays.
         images_list = [[to_numpy_array(image) for image in images] for images in images_list]
 
+        if is_scaled_image(images_list[0][0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
         new_images_list = []
         for images in images_list:
             new_images = []
@@ -797,9 +829,6 @@ def preprocess(
             resample=resample,
         )
 
-        # # All transformations expect numpy arrays.
-        # images_list = [[to_numpy_array(image) for image in images] for images in images_list]
-
         if do_resize:
             images_list = [
                 [
@@ -812,77 +841,54 @@ def preprocess(
         # Resize might already change the channel dimension, so we will recompute it
         input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
 
-        # We will resize both height and width of each image to the nearest 364 multiple, disregarding the aspect ratio
-        # for size=(10, 364) -> rescaled_size=(364, 364)
-        # for size=(11, 365) -> rescaled_size=(364, 364*2)
-        new_images_list = []
-        if "longest_edge" in max_image_size:
-            vision_encoder_max_size = max_image_size["longest_edge"]
-        elif isinstance(max_image_size, int):
-            vision_encoder_max_size = max_image_size
-        else:
-            raise ValueError("Invalid max_image_size, must be a dictionary with key 'longest_edge' or an integer.")
-
-        for images in images_list:
-            new_images = []
-            for img in images:
-                if isinstance(img, Image.Image):
-                    width, height = img.size
-                else:
-                    height, width, _ = img.shape
-                aspect_ratio = width / height
-                if width >= height:
-                    width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
-                    height = int(width / aspect_ratio)
-                    height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
-                elif height > width:
-                    height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
-                    width = int(height * aspect_ratio)
-                    width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
-                new_size = {"height": height, "width": width}
-                new_images.append(
-                    self.resize(img, size=new_size, resample=resample, input_data_format=input_data_format)
-                )
-            new_images_list.append(new_images)
-        images_list = new_images_list
-        del new_images_list
-
         if do_image_splitting:
+            # We first round the height and width of each image up to a 364 multiple, disregarding the aspect ratio
+            # for size=(10, 364) -> rescaled_size=(364, 364)
+            # for size=(11, 365) -> rescaled_size=(364, 364*2)
+            images_list = [
+                [
+                    self.resize_for_vision_encoder(
+                        image, max_image_size["longest_edge"], resample=resample, input_data_format=input_data_format
+                    )
+                    for image in images
+                ]
+                for images in images_list
+            ]
             images_list_split_arrays = []
+            palettes_list_split_arrays = []
             images_list_rows = []
             images_list_cols = []
-            for images in images_list:
+            for images, palettes in zip(images_list, palettes_list):
                 split_image_arrays = []
+                split_palettes_arrays = []
                 image_rows = []
                 image_cols = []
-                for image in images:
+                for image, palette in zip(images, palettes):
                     split_image_array, rows, cols = self.split_image(
                         image,
                         max_image_size=max_image_size,
                         input_data_format=input_data_format,
                     )
                     split_image_arrays.extend(split_image_array)
+                    split_palettes_arrays.extend([palette] * len(split_image_array))
                     image_rows.append(rows)
                     image_cols.append(cols)
                 images_list_split_arrays.append(split_image_arrays)
+                palettes_list_split_arrays.append(split_palettes_arrays)
                 images_list_rows.append(image_rows)
                 images_list_cols.append(image_cols)
             images_list = images_list_split_arrays
+            palettes_list = palettes_list_split_arrays
         else:
             images_list_rows = [[0] * len(images) for images in images_list]
             images_list_cols = [[0] * len(images) for images in images_list]
 
         if do_convert_rgb:
             images_list = [
-                [convert_to_rgb(image, palette, input_data_format=input_data_format) for image in images]
-                for images, palette in zip(images_list, palettes)
+                [convert_to_rgb(img, plt, input_data_format=input_data_format) for img, plt in zip(images, palettes)]
+                for images, palettes in zip(images_list, palettes_list)
             ]
 
-        if is_scaled_image(images_list[0][0]) and do_rescale:
-            logger.warning_once(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
         if do_rescale:
             rescaled_images_array = []
             for image in images_list:
@@ -913,9 +919,13 @@ def preprocess(
                 for images in images_list
             ]
 
-        data = {"pixel_values": np.array(images_list) if do_pad else images_list}  # Faster tensor conversion
+        data = {
+            "pixel_values": np.array(images_list) if do_pad and return_tensors is not None else images_list
+        }  # Faster tensor conversion
         if pixel_attention_mask is not None:
-            data["pixel_attention_mask"] = np.array(pixel_attention_mask) if do_pad else pixel_attention_mask
+            data["pixel_attention_mask"] = (
+                np.array(pixel_attention_mask) if do_pad and return_tensors is not None else pixel_attention_mask
+            )
 
         encoding = BatchFeature(data=data, tensor_type=return_tensors)
 
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index d8945db7fa7ce6..550bb2785e0057 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -26,7 +26,7 @@
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_multi_gpu, slow, torch_device
+from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -486,7 +486,7 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     @slow
-    @require_torch_multi_gpu
+    @unittest.skip("multi-gpu tests are disabled for now")
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
             "HuggingFaceM4/Idefics3-8B-Llama3",
@@ -508,7 +508,7 @@ def test_integration_test(self):
 
     @slow
     @require_bitsandbytes
-    @require_torch_multi_gpu
+    @unittest.skip("multi-gpu tests are disabled for now")
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index f4396b7c13d540..928c9a339a5a7a 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -18,6 +18,7 @@
 import unittest
 from io import BytesIO
 
+import numpy as np
 import requests
 
 from transformers import Idefics3Processor
@@ -112,9 +113,9 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
 
         # Test that a single image is processed correctly
         inputs = processor(images=self.image1)
-        image1_expected_size = (1092, 1456)
-        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, *image1_expected_size))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, *image1_expected_size))
+        image1_expected_size = (970, 1456)
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
+        self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
         # fmt: on
 
         # Test a single sample with image and text
@@ -128,8 +129,8 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
-        self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 1092, 1456))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 1092, 1456))
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
+        self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
         # fmt: on
 
         # Test that batch is correctly processed
@@ -162,8 +163,8 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
             inputs["attention_mask"],
             [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
         )
-        self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 1456, 1456))
-        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 1456, 1456))
+        self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 2, 3, 1140, 1456))
+        self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 2, 1140, 1456))
         # fmt: on
 
     def test_process_interleaved_images_prompts_image_splitting(self):
@@ -172,8 +173,8 @@ def test_process_interleaved_images_prompts_image_splitting(self):
 
         # Test that a single image is processed correctly
         inputs = processor(images=self.image1)
-        self.assertEqual(inputs["pixel_values"].shape, (1, 13, 3, 364, 364))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 13, 364, 364))
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 364, 364))
+        self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 364, 364))
         # fmt: on
         self.maxDiff = None
 
@@ -189,8 +190,8 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         expected_input_ids_1 = [[self.bos_token_id] + splitted_image1_tokens + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids_1)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids_1[0])])
-        self.assertEqual(inputs["pixel_values"].shape, (1, 13, 3, 364, 364))
-        self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 13, 364, 364))
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 364, 364))
+        self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 364, 364))
         # fmt: on
 
         # Test that batch is correctly processed
@@ -226,8 +227,8 @@ def test_process_interleaved_images_prompts_image_splitting(self):
             inputs["attention_mask"],
             [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
         )
-        self.assertEqual(inputs['pixel_values'].shape, (2, 30, 3, 364, 364))
-        self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 30, 364, 364))
+        self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 30, 3, 364, 364))
+        self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 30, 364, 364))
         # fmt: on
 
     def test_add_special_tokens_processor(self):

From 164fbe800e86aa7b2f5b840798f1b9536fdb7ed2 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 2 Sep 2024 13:29:23 +0000
Subject: [PATCH 30/50] last comments from Amy

---
 .../models/idefics3/modeling_idefics3.py      | 212 +++++++++---------
 .../models/idefics3/processing_idefics3.py    |   7 +-
 .../idefics3/test_processing_idefics3.py      |   2 +-
 3 files changed, 111 insertions(+), 110 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 83d371711c5e9a..7d507c06655423 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -129,6 +129,7 @@ class Idefics3CausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
 
 
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionEmbeddings with Idefics2->Idefics3
 class Idefics3VisionEmbeddings(nn.Module):
     """
     This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
@@ -537,109 +538,6 @@ def forward(
         )
 
 
-IDEFICS3_VISION_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`Idefics3VisionConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The Idefics3 Vision Transformer Model outputting raw image embedding.",
-    IDEFICS3_VISION_START_DOCSTRING,
-)
-class Idefics3VisionTransformer(PreTrainedModel):
-    config_class = Idefics3VisionConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Idefics3VisionAttention"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_cache_class = True
-
-    def __init__(self, config: Idefics3VisionConfig):
-        super().__init__(config)
-        embed_dim = config.hidden_size
-
-        self.embeddings = Idefics3VisionEmbeddings(config)
-        self.encoder = Idefics3Encoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
-        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-    def set_input_embeddings(self, value):
-        self.embeddings = value
-
-    def forward(
-        self,
-        pixel_values,
-        patch_attention_mask: Optional[torch.BoolTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutput]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        batch_size = pixel_values.size(0)
-        if patch_attention_mask is None:
-            patch_size = self.config.patch_size
-            patch_attention_mask = torch.ones(
-                (
-                    batch_size,
-                    pixel_values.size(2) // patch_size,
-                    pixel_values.size(3) // patch_size,
-                )
-            )
-            patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
-
-        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
-
-        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
-        # The call to `_upad_input` in `_flash_attention_forward` is expensive
-        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
-        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
-        if not torch.any(~patch_attention_mask):
-            patch_attention_mask = None
-        elif not self._use_flash_attention_2:
-            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
-
-        encoder_outputs = self.encoder(
-            inputs_embeds=hidden_states,
-            attention_mask=patch_attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        last_hidden_state = encoder_outputs[0]
-        last_hidden_state = self.post_layernorm(last_hidden_state)
-
-        if not return_dict:
-            return (last_hidden_state,) + encoder_outputs[1:]
-
-        return BaseModelOutput(
-            last_hidden_state=last_hidden_state,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-        )
-
-
 # Copied from transformers.models.llama.modeling_llama.repeat_kv
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
@@ -747,6 +645,109 @@ def _init_weights(self, module):
                 module.weight.data[module.padding_idx].zero_()
 
 
+IDEFICS3_VISION_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Idefics3VisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The Idefics3 Vision Transformer Model outputting raw image embedding.",
+    IDEFICS3_VISION_START_DOCSTRING,
+)
+class Idefics3VisionTransformer(Idefics3PreTrainedModel):
+    config_class = Idefics3VisionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Idefics3VisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+
+    def __init__(self, config: Idefics3VisionConfig):
+        super().__init__(config)
+        embed_dim = config.hidden_size
+
+        self.embeddings = Idefics3VisionEmbeddings(config)
+        self.encoder = Idefics3Encoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings = value
+
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size = pixel_values.size(0)
+        if patch_attention_mask is None:
+            patch_size = self.config.patch_size
+            patch_attention_mask = torch.ones(
+                (
+                    batch_size,
+                    pixel_values.size(2) // patch_size,
+                    pixel_values.size(3) // patch_size,
+                )
+            )
+            patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
+
+        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+
+        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+        # The call to `_upad_input` in `_flash_attention_forward` is expensive
+        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
+        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
+        if not torch.any(~patch_attention_mask):
+            patch_attention_mask = None
+        elif not self._use_flash_attention_2:
+            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=patch_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        if not return_dict:
+            return (last_hidden_state,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
 IDEFICS3_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -891,12 +892,11 @@ def inputs_merger(
         """
         num_images, _, vision_hidden_size = image_hidden_states.shape
         special_image_token_mask = input_ids == self.image_token_id
-        new_inputs_embeds = inputs_embeds.clone()
         reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
         # cast to the dtype of the input_embeds to support quantized models
         reshaped_image_hidden_states = reshaped_image_hidden_states.to(inputs_embeds.dtype)
-        new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
-        return new_inputs_embeds
+        inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
+        return inputs_embeds
 
     @add_start_docstrings_to_model_forward(
         """
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 2f815044d6a831..50e1715b94a7ae 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -91,7 +91,6 @@ def get_image_prompt_string(
 
 
 class Idefics3ImagesKwargs(ImagesKwargs, total=False):
-    image_seq_len: Optional[int]
     return_row_col_info: Optional[bool]
     max_image_size: Optional[Dict[str, int]]
 
@@ -150,6 +149,8 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
         self.global_img_token = "<global-img>"
         self.image_seq_len = image_seq_len
 
+        # This regex matches one or more occurrences of <global-img> tags (optionally surrounded by newline characters)
+        # or <row_x_col_y> tags (where x and y are one or more digits, optionally followed by a newline character).
         self._regex_to_remove_extra_special_tokens = re.compile(r"(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+")
 
         tokens_to_add = {
@@ -181,6 +182,7 @@ def __call__(
         text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None,
         audio=None,
         videos=None,
+        image_seq_len: Optional[int] = None,
         **kwargs: Unpack[Idefics3ProcessorKwargs],
     ) -> BatchEncoding:
         """
@@ -210,7 +212,7 @@ def __call__(
         >>> input_ids = outputs.input_ids
         >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
         >>> print(input_tokens)
-        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
+        ['<|begin_of_text|><fake_token_around_image><global-img>((<image>)*169)<fake_token_around_image> In this image, we see', '<|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|begin_of_text|>bla bla bla<fake_token_around_image><global-img>((<image>)*169)<fake_token_around_image>']
         ```
 
         Args:
@@ -255,7 +257,6 @@ def __call__(
         # Temporary fix for "padding_side" in init_kwargs
         output_kwargs["text_kwargs"].pop("padding_side", None)
 
-        image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
         image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
 
         n_images_in_text = []
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 928c9a339a5a7a..bf111b3aebd684 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -315,7 +315,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self):
         )
         tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
 
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, image_seq_len=2)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = "lower newer <image>"

From 000c8eae5ab53d70bc0ee1cbe3600f78b380ed4a Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 6 Sep 2024 15:12:46 +0000
Subject: [PATCH 31/50] review amy

---
 .../idefics3/image_processing_idefics3.py     | 57 ++++++++----------
 .../models/idefics3/modeling_idefics3.py      | 59 +++++++++++--------
 .../models/idefics3/processing_idefics3.py    |  1 -
 .../idefics3/test_processing_idefics3.py      | 10 +---
 4 files changed, 62 insertions(+), 65 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 31c2ee71c69db2..3cd6c2e1480cec 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -30,10 +30,7 @@
     PILImageResampling,
     get_image_size,
     infer_channel_dimension_format,
-    is_jax_tensor,
     is_scaled_image,
-    is_tf_tensor,
-    is_torch_tensor,
     is_valid_image,
     to_numpy_array,
     valid_images,
@@ -265,6 +262,8 @@ def to_pil_image(
             and `False` otherwise.
         input_data_format (`ChannelDimension`, *optional*):
             The channel dimension format of the input image. If unset, will use the inferred format from the input.
+        image_mode (`str`, *optional*):
+            The mode of the image.
 
     Returns:
         `PIL.Image.Image`: The converted image.
@@ -272,12 +271,7 @@ def to_pil_image(
     if isinstance(image, PIL.Image.Image):
         return image
     # Convert all tensors to numpy arrays before converting to PIL image
-    if is_torch_tensor(image) or is_tf_tensor(image):
-        image = image.numpy()
-    elif is_jax_tensor(image):
-        image = np.array(image)
-    elif not isinstance(image, np.ndarray):
-        raise ValueError("Input image type not supported: {}".format(type(image)))
+    image = to_numpy_array(image)
 
     # If the channel has been moved to first dim, we put it back at the end.
     image = to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format)
@@ -451,19 +445,21 @@ def resize(
             size = (size["height"], size["width"])
         else:
             raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
-        if isinstance(image, Image.Image):
-            return image.resize((size[1], size[0]), resample=resample)
-        else:
-            image_mode = None
-            if image.ndim == 2 or image.shape[-1] == 1:
-                image_mode = "P"
-            # Custom to_pil_image function to support image_mode
-            image = to_pil_image(image, input_data_format=input_data_format, image_mode=image_mode)
+        image_mode = None
+        if image.ndim == 2 or image.shape[-1] == 1:
+            image_mode = "P"
+        # Custom to_pil_image function to support image_mode
+        image = to_pil_image(image, input_data_format=input_data_format, image_mode=image_mode)
 
         resized_image = image.resize((size[1], size[0]), resample=resample)
         resized_array = np.array(resized_image)
+        if resized_array.ndim == 3 and input_data_format == ChannelDimension.FIRST:
+            resized_array = np.moveaxis(resized_array, -1, 0)
         if resized_array.ndim == 2:
-            resized_array = np.expand_dims(resized_array, axis=-1)
+            if input_data_format == ChannelDimension.FIRST:
+                resized_array = np.expand_dims(resized_array, axis=0)
+            elif input_data_format == ChannelDimension.LAST:
+                resized_array = np.expand_dims(resized_array, axis=-1)
         return resized_array
 
     def split_image(
@@ -705,7 +701,6 @@ def preprocess(
         return_row_col_info: bool = False,
         input_data_format: Optional[ChannelDimension] = None,
         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        crop_size: Optional[Dict[str, int]] = None,
     ):
         """
         Preprocess a batch of images.
@@ -761,12 +756,7 @@ def preprocess(
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-            crop_size (`Dict[str, int]`, *optional*):
-                This parameter is not used in this method. It is only present for compatibility.
         """
-        if crop_size is not None:
-            logger.warning("crop_size is not used in Idefics3ImageProcessor.preprocess.")
-
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
         resample = resample if resample is not None else self.resample
@@ -817,6 +807,12 @@ def preprocess(
         if input_data_format is None:
             # We assume that all images have the same channel dimension format.
             input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
+            if input_data_format is ChannelDimension.FIRST:
+                images_list = [
+                    [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images]
+                    for images in images_list
+                ]
+                input_data_format = ChannelDimension.LAST
 
         validate_preprocess_arguments(
             do_rescale=do_rescale,
@@ -838,13 +834,10 @@ def preprocess(
                 for images in images_list
             ]
 
-        # Resize might already change the channel dimension, so we will recompute it
-        input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
-
         if do_image_splitting:
-            # We first resize both height and width of each image to the nearest 364 multiple, disregarding the aspect ratio
-            # for size=(10, 364) -> rescaled_size=(364, 364)
-            # for size=(11, 365) -> rescaled_size=(364, 364*2)
+            # We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
+            # for size=(10, max_image_size) -> rescaled_size=(max_image_size, max_image_size)
+            # for size=(11, max_image_size+1) -> rescaled_size=(max_image_size, max_image_size*2)
             images_list = [
                 [
                     self.resize_for_vision_encoder(
@@ -892,7 +885,9 @@ def preprocess(
         if do_rescale:
             rescaled_images_array = []
             for image in images_list:
-                rescaled_images_array.append([rescale(img, rescale_factor) for img in image])
+                rescaled_images_array.append(
+                    [rescale(img, rescale_factor, input_data_format=input_data_format) for img in image]
+                )
             images_list = rescaled_images_array
 
         if do_normalize:
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 7d507c06655423..6d814de584ea8e 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -100,24 +100,20 @@ class Idefics3CausalLMOutputWithPast(ModelOutput):
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
-
             Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
             `past_key_values` input) to speed up sequential decoding.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
             sequence_length)`.
-
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
             Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
             sequence_length, hidden_size)`.
-
             image_hidden_states of the model produced by the vision encoder
     """
 
@@ -400,6 +396,7 @@ def forward(self, x):
         return self.proj(x)
 
 
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2EncoderLayer with Idefics2->Idefics3
 class Idefics3EncoderLayer(nn.Module):
     def __init__(self, config: Idefics3Config):
         super().__init__()
@@ -625,6 +622,7 @@ class Idefics3PreTrainedModel(PreTrainedModel):
     _supports_flash_attn_2 = True
     _supports_cache_class = True
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2PreTrainedModel._init_weights
     def _init_weights(self, module):
         std = (
             self.config.text_config.initializer_range
@@ -681,12 +679,15 @@ def __init__(self, config: Idefics3VisionConfig):
 
         self.embeddings = Idefics3VisionEmbeddings(config)
         self.encoder = Idefics3Encoder(config)
+        self.patch_size = config.patch_size
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
     def get_input_embeddings(self):
         return self.embeddings
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.set_input_embeddings
     def set_input_embeddings(self, value):
         self.embeddings = value
 
@@ -706,7 +707,7 @@ def forward(
 
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
-            patch_size = self.config.patch_size
+            patch_size = self.patch_size
             patch_attention_mask = torch.ones(
                 (
                     batch_size,
@@ -843,6 +844,7 @@ def __init__(self, config: Idefics3Config):
 
         self.post_init()
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.enable_input_require_grads
     def enable_input_require_grads(self):
         """
         Enables the gradients for the input embeddings.
@@ -869,9 +871,11 @@ def make_inputs_require_grads(module, input, output):
             make_inputs_require_grads
         )
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.get_input_embeddings
     def get_input_embeddings(self):
         return self.text_model.get_input_embeddings()
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.set_input_embeddings
     def set_input_embeddings(self, value):
         self.text_model.set_input_embeddings(value)
 
@@ -1038,6 +1042,7 @@ def forward(
 class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.__init__ with Idefics2->Idefics3
     def __init__(self, config):
         super().__init__(config)
         self.model = Idefics3Model(config)
@@ -1049,6 +1054,7 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.enable_input_require_grads
     def enable_input_require_grads(self):
         """
         Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
@@ -1063,18 +1069,23 @@ def make_inputs_require_grads(module, input, output):
             make_inputs_require_grads
         )
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.get_input_embeddings
     def get_input_embeddings(self):
         return self.model.text_model.get_input_embeddings()
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.set_input_embeddings
     def set_input_embeddings(self, value):
         self.model.text_model.set_input_embeddings(value)
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.get_output_embeddings
     def get_output_embeddings(self):
         return self.lm_head
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.tie_weights
     def tie_weights(self):
         """
         Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
@@ -1133,25 +1144,23 @@ def forward(
 
         >>> # Create inputs
         >>> messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image"},
-                        {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
-                        {"type": "image"},
-                        {"type": "text", "text": "What can we see in this image?"},
-                    ]
-                },
-                {
-                "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": "In which city is that bridge located?"},
-                ]
-            }
-
-            ]
-
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "What can we see in this image?"},
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In which city is that bridge located?"},
+        ...         ]
+        ...     }
+        ... ]
 
         >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
         >>> images = [[image1, image2], [image3]]
@@ -1221,6 +1230,7 @@ def forward(
             image_hidden_states=outputs.image_hidden_states,
         )
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.prepare_inputs_for_generation
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
@@ -1285,6 +1295,7 @@ def prepare_inputs_for_generation(
         )
         return model_inputs
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration._update_model_kwargs_for_generation
     def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
         model_kwargs = super()._update_model_kwargs_for_generation(
             outputs=outputs,
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 50e1715b94a7ae..6955e7ae7bb0c8 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -223,7 +223,6 @@ def __call__(
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-
                 Wherever an image token, `<image>` is encountered it is expanded to
                 `<fake_token_around_image>` + `<row_x_col_y>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
             image_seq_len (`int`, *optional*):
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index bf111b3aebd684..997847c1ba5210 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -285,13 +285,12 @@ def test_apply_chat_template(self):
         )
         self.assertEqual(rendered, expected_rendered)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_torch
     @require_vision
     def test_image_processor_defaults_preserved_by_image_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        image_processor = self.get_component("image_processor")
         tokenizer = self.get_component("tokenizer", max_length=117)
 
         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
@@ -304,7 +303,6 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
         self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
         self.assertEqual(len(inputs["pixel_values"][0][0][0]), 364)  # crop size doesn't affect our image processor
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_torch
     @require_vision
     def test_kwargs_overrides_default_image_processor_kwargs(self):
@@ -326,7 +324,6 @@ def test_kwargs_overrides_default_image_processor_kwargs(self):
         self.assertEqual(len(inputs["pixel_values"][0][0][0]), 32)
         self.assertEqual(len(inputs["input_ids"][0]), 117)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_vision
     @require_torch
     def test_kwargs_overrides_default_tokenizer_kwargs(self):
@@ -343,7 +340,6 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self):
         inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
         self.assertEqual(len(inputs["input_ids"][0]), 30)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_torch
     @require_vision
     def test_structured_kwargs_nested(self):
@@ -372,7 +368,6 @@ def test_structured_kwargs_nested(self):
 
         self.assertEqual(len(inputs["input_ids"][0]), 120)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_torch
     @require_vision
     def test_structured_kwargs_nested_from_dict(self):
@@ -398,7 +393,6 @@ def test_structured_kwargs_nested_from_dict(self):
         self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_vision
     @require_torch
     def test_tokenizer_defaults_preserved_by_kwargs(self):
@@ -415,7 +409,6 @@ def test_tokenizer_defaults_preserved_by_kwargs(self):
         inputs = processor(text=input_str, images=image_input, return_tensors="pt")
         self.assertEqual(len(inputs["input_ids"][0]), 30)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_torch
     @require_vision
     def test_unstructured_kwargs_batched(self):
@@ -443,7 +436,6 @@ def test_unstructured_kwargs_batched(self):
         self.assertEqual(inputs["pixel_values"].shape[3], 30)
         self.assertEqual(len(inputs["input_ids"][0]), 76)
 
-    # We need to overwrite this test to adapt it to our processor.
     @require_torch
     @require_vision
     def test_unstructured_kwargs(self):

From 4d02e0cea90266f38b3af76398695e9e319118ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Wed, 4 Sep 2024 17:25:08 +0200
Subject: [PATCH 32/50] Update
 src/transformers/models/idefics3/image_processing_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 .../models/idefics3/image_processing_idefics3.py    | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 3cd6c2e1480cec..8dec6afde7d595 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -793,16 +793,9 @@ def preprocess(
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
 
-        new_images_list = []
-        for images in images_list:
-            new_images = []
-            for img in images:
-                if img.ndim == 2:
-                    img = np.expand_dims(img, axis=-1)
-                new_images.append(img)
-            new_images_list.append(new_images)
-        images_list = new_images_list
-        del new_images_list
+        images_list = [
+            [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
+        ]
 
         if input_data_format is None:
             # We assume that all images have the same channel dimension format.

From 3bf03c213c4aec71675599eab77b06bc1073dc33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Wed, 4 Sep 2024 18:29:40 +0200
Subject: [PATCH 33/50] Update
 src/transformers/models/idefics3/modeling_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/models/idefics3/modeling_idefics3.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 6d814de584ea8e..0435ebfe8e507b 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -666,12 +666,6 @@ def _init_weights(self, module):
 )
 class Idefics3VisionTransformer(Idefics3PreTrainedModel):
     config_class = Idefics3VisionConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Idefics3VisionAttention"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_cache_class = True
 
     def __init__(self, config: Idefics3VisionConfig):
         super().__init__(config)

From 57bfd51a9d9b9756c318ff1b948ddb8280ba1b75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Fri, 6 Sep 2024 17:11:34 +0200
Subject: [PATCH 34/50] Update docs/source/en/model_doc/idefics3.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 docs/source/en/model_doc/idefics3.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 4c3ae0d1c0146b..621d795403c647 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -26,8 +26,7 @@ Idefics3 is an adaptation of the Idefics2 model with three main differences:
 - It uses an updated processing logic for the images.
 - It removes the perceiver.
 
-Input images are either upsampled such that the longest side is 4*364 (if `do_resize` is set `True`) or processed in their original resolution.
-In any case, the image processors decomposes images in patches of 364x364 pixels.
+Input images are either upsampled such that the longest side is 4*364 (if `do_resize` is set `True`) or processed in their original resolution. In both cases, the image processor then decomposes images into patches of 364x364 pixels.
 
 The abstract from the paper is the following:
 

From 63b1d7f01d5aea8fd63b1a988d4490b4aa23d4d9 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Fri, 6 Sep 2024 15:28:29 +0000
Subject: [PATCH 35/50] doc improvement - amy review

---
 docs/source/en/model_doc/idefics3.md | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 621d795403c647..be912072e7dc0d 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -26,7 +26,17 @@ Idefics3 is an adaptation of the Idefics2 model with three main differences:
 - It uses an updated processing logic for the images.
 - It removes the perceiver.
 
-Input images are either upsampled such that the longest side is 4*364 (if `do_resize` is set `True`) or processed in their original resolution. In both cases, the image processor then decomposes images into patches of 364x364 pixels.
+Input images are processed either by upsampling (if resizing is enabled) or at their original resolution. The resizing behavior depends on two parameters: `do_resize` and `size`.
+
+If `do_resize` is set to `True`, the image processor resizes images so that the longest edge is 4*364 pixels by default.
+The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 364}` is the default, but you can change it to a different value if needed.
+
+Here’s how to control resizing and set a custom size:
+```python
+image_processor = Idefics3ImageProcessor(do_resize=True, size={"longest_edge": 2 * 364}, max_image_size=364)
+```
+
+Additionally, the `max_image_size` parameter, which controls the size of each square patch the image is decomposed into, is set to 364 by default but can be adjusted as needed. After resizing (if applicable), the image processor decomposes the images into square patches based on the `max_image_size` parameter.
 
 The abstract from the paper is the following:
 

From 6325fbc05ad3047b0eaa8fcc4466a9084b3c4af6 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 10 Sep 2024 10:01:50 +0000
Subject: [PATCH 36/50] fix runtime error during fine-tuning

---
 src/transformers/models/idefics3/modeling_idefics3.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 0435ebfe8e507b..ddedc663be2a91 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -890,11 +890,14 @@ def inputs_merger(
         """
         num_images, _, vision_hidden_size = image_hidden_states.shape
         special_image_token_mask = input_ids == self.image_token_id
+        new_inputs_embeds = (
+            inputs_embeds.clone()
+        )  #  Fixes RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
         reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
         # cast to the dtype of the input_embeds to support quantized models
         reshaped_image_hidden_states = reshaped_image_hidden_states.to(inputs_embeds.dtype)
-        inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
-        return inputs_embeds
+        new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states
+        return new_inputs_embeds
 
     @add_start_docstrings_to_model_forward(
         """

From 76b8892a909aef9c3105a319286da36bd944d542 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 16 Sep 2024 15:47:34 +0000
Subject: [PATCH 37/50] amy's review

---
 docs/source/en/model_doc/idefics3.md          |   4 -
 .../idefics3/image_processing_idefics3.py     | 121 ++++++++----------
 .../models/idefics3/processing_idefics3.py    |  18 +--
 .../test_image_processing_idefics3.py         |   4 +-
 .../idefics3/test_processing_idefics3.py      |   2 +-
 5 files changed, 57 insertions(+), 92 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index be912072e7dc0d..3ccdc3f576767f 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -42,10 +42,6 @@ The abstract from the paper is the following:
 
 *The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.*
 
-Tips:
-
-- By default, the input given to the model will be resized by default such that the longest side is 4*364. For faster inference, set `do_resize` to `False`.
-
 This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
 
 
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 8dec6afde7d595..e879316598c916 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -212,7 +212,7 @@ def get_max_height_width(
     Get the maximum height and width across all images in a batch.
     """
     if input_data_format is None:
-        input_data_format = infer_channel_dimension_format(images_list[0][0])
+        input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
 
     max_height = max_width = float("-inf")
     for images in images_list:
@@ -245,8 +245,6 @@ def make_pixel_mask(
 # Custom to_pil_image function to support image_mode
 def to_pil_image(
     image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
-    do_rescale: Optional[bool] = None,
-    input_data_format: Optional[Union[str, ChannelDimension]] = None,
     image_mode: Optional[str] = None,
 ) -> "PIL.Image.Image":
     """
@@ -256,12 +254,6 @@ def to_pil_image(
     Args:
         image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
             The image to convert to the `PIL.Image` format.
-        do_rescale (`bool`, *optional*):
-            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
-            to `True` if the image type is a floating type and casting to `int` would result in a loss of precision,
-            and `False` otherwise.
-        input_data_format (`ChannelDimension`, *optional*):
-            The channel dimension format of the input image. If unset, will use the inferred format from the input.
         image_mode (`str`, *optional*):
             The mode of the image.
 
@@ -274,7 +266,7 @@ def to_pil_image(
     image = to_numpy_array(image)
 
     # If the channel has been moved to first dim, we put it back at the end.
-    image = to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format)
+    image = to_channel_dimension_format(image, ChannelDimension.LAST, infer_channel_dimension_format(image, num_channels=(1, 3, 4)))
 
     # If there is a single channel, we squeeze it, as otherwise PIL can't handle it.
     image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image
@@ -285,7 +277,7 @@ def to_pil_image(
 def convert_to_rgb(
     image: ImageInput,
     palette: Optional[PIL.ImagePalette.ImagePalette] = None,
-    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> ImageInput:
     """
     Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
@@ -301,7 +293,7 @@ def convert_to_rgb(
     if not isinstance(image, PIL.Image.Image):
         mode = "P" if palette is not None else None
         # Custom to_pil_image function to support image_mode
-        image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format)
+        image = to_pil_image(image, image_mode=mode)
         if image.mode == "P" and palette is not None:
             image.putpalette(palette)
 
@@ -309,7 +301,11 @@ def convert_to_rgb(
     background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
     alpha_composite = Image.alpha_composite(background, image_rgba)
     alpha_composite = alpha_composite.convert("RGB")
-    return np.array(alpha_composite)
+    output_array = np.array(alpha_composite)
+    output_data_format = infer_channel_dimension_format(output_array, num_channels=(1, 3, 4))
+    if data_format != output_data_format:
+        output_array = to_channel_dimension_format(output_array, data_format, output_data_format)
+    return output_array
 
 
 # FIXME Amy: make a more general crop function that isn't just centre crop
@@ -319,22 +315,18 @@ def _crop(
     h1: int,
     w2: int,
     h2: int,
-    input_data_format: Optional[Union[str, ChannelDimension]] = None,
     data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> np.ndarray:
-    if input_data_format is None:
-        input_data_format = infer_channel_dimension_format(image)
+    if data_format is None:
+        data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
 
-    if input_data_format == ChannelDimension.FIRST:
+    if data_format == ChannelDimension.FIRST:
         image = image[:, h1:h2, w1:w2]
-    elif input_data_format == ChannelDimension.LAST:
+    elif data_format == ChannelDimension.LAST:
         image = image[h1:h2, w1:w2, :]
     else:
         raise ValueError("Invalid channel dimension format.")
 
-    if data_format is not None:
-        image = to_channel_dimension_format(image, data_format)
-
     return image
 
 
@@ -417,7 +409,6 @@ def resize(
         image: np.ndarray,
         size: Dict[str, int],
         resample: PILImageResampling = PILImageResampling.LANCZOS,
-        data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
     ) -> np.ndarray:
@@ -432,11 +423,12 @@ def resize(
                 Size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+
         if "longest_edge" in size:
             size = get_resize_output_image_size(
                 image, resolution_max_side=size["longest_edge"], input_data_format=input_data_format
@@ -449,7 +441,7 @@ def resize(
         if image.ndim == 2 or image.shape[-1] == 1:
             image_mode = "P"
         # Custom to_pil_image function to support image_mode
-        image = to_pil_image(image, input_data_format=input_data_format, image_mode=image_mode)
+        image = to_pil_image(image, image_mode=image_mode)
 
         resized_image = image.resize((size[1], size[0]), resample=resample)
         resized_array = np.array(resized_image)
@@ -466,7 +458,6 @@ def split_image(
         self,
         image,
         max_image_size: Dict[str, int],
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         resample: PILImageResampling = PILImageResampling.LANCZOS,
     ):
@@ -485,14 +476,12 @@ def split_image(
             max_image_size (`Dict[str, int]`):
                 Maximum size of the output image. If the image is larger than this size, it will be split into
                 patches of this size, and the original image will be concatenated with the patches, resized to max_size.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format of the input image. If not provided, it will be inferred.
             data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the output image. If not provided, it will be the same as the input image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
         """
-        height, width = get_image_size(image, channel_dim=input_data_format)
+        height, width = get_image_size(image, channel_dim=data_format)
         max_height = max_width = max_image_size["longest_edge"]
 
         frames = []
@@ -522,7 +511,6 @@ def split_image(
                         start_y,
                         end_x,
                         end_y,
-                        input_data_format=input_data_format,
                         data_format=data_format,
                     )
                     frames.append(cropped_image)
@@ -534,14 +522,11 @@ def split_image(
                     image,
                     {"height": global_image_height, "width": global_image_width},
                     resample=resample,
-                    input_data_format=input_data_format,
+                    input_data_format=data_format,
                 )
         else:
             num_splits_h, num_splits_w = 0, 0
 
-        if data_format is not None:
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
-
         frames.append(image)
 
         return frames, num_splits_h, num_splits_w
@@ -567,7 +552,12 @@ def resize_for_vision_encoder(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred
         """
-        height, width, _ = image.shape
+        if input_data_format == ChannelDimension.FIRST:
+            _, height, width = image.shape
+        elif input_data_format == ChannelDimension.LAST:
+            height, width, _ = image.shape
+        else:
+            raise ValueError("Invalid input_data_format.")
         aspect_ratio = width / height
         if width >= height:
             width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
@@ -644,7 +634,7 @@ def pad(
         batch_size = len(images)
         max_num_images = max(len(images_) for images_ in images)
         input_data_format = (
-            infer_channel_dimension_format(images[0][0]) if input_data_format is None else input_data_format
+            infer_channel_dimension_format(images[0][0], num_channels=(1, 3, 4)) if input_data_format is None else input_data_format
         )
         data_format = input_data_format if data_format is None else data_format
 
@@ -699,7 +689,6 @@ def preprocess(
         do_pad: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_row_col_info: bool = False,
-        input_data_format: Optional[ChannelDimension] = None,
         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
     ):
         """
@@ -750,12 +739,6 @@ def preprocess(
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
@@ -793,19 +776,26 @@ def preprocess(
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
 
-        images_list = [
-            [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
-        ]
+        if data_format is None or data_format == ChannelDimension.LAST:
+            images_list = [
+                [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+        elif data_format == ChannelDimension.FIRST:
+            images_list = [
+                [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
 
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
-            if input_data_format is ChannelDimension.FIRST:
-                images_list = [
-                    [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images]
-                    for images in images_list
+        if data_format is not None:
+            images_list = [
+                [
+                    to_channel_dimension_format(image, data_format, infer_channel_dimension_format(image, num_channels=(1, 3, 4)))
+                    for image in images
                 ]
-                input_data_format = ChannelDimension.LAST
+                for images in images_list
+            ]
+        else:
+            # We assume that all images have the same channel dimension format.
+            data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
 
         validate_preprocess_arguments(
             do_rescale=do_rescale,
@@ -821,7 +811,7 @@ def preprocess(
         if do_resize:
             images_list = [
                 [
-                    self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                    self.resize(image=image, size=size, resample=resample, input_data_format=data_format)
                     for image in images
                 ]
                 for images in images_list
@@ -834,7 +824,7 @@ def preprocess(
             images_list = [
                 [
                     self.resize_for_vision_encoder(
-                        image, max_image_size["longest_edge"], resample=resample, input_data_format=input_data_format
+                        image, max_image_size["longest_edge"], resample=resample, input_data_format=data_format
                     )
                     for image in images
                 ]
@@ -853,7 +843,7 @@ def preprocess(
                     split_image_array, rows, cols = self.split_image(
                         image,
                         max_image_size=max_image_size,
-                        input_data_format=input_data_format,
+                        data_format=data_format,
                     )
                     split_image_arrays.extend(split_image_array)
                     split_palettes_arrays.extend([palette] * len(split_image_array))
@@ -871,7 +861,7 @@ def preprocess(
 
         if do_convert_rgb:
             images_list = [
-                [convert_to_rgb(img, plt, input_data_format=input_data_format) for img, plt in zip(images, palettes)]
+                [convert_to_rgb(img, plt, data_format=data_format) for img, plt in zip(images, palettes)]
                 for images, palettes in zip(images_list, palettes_list)
             ]
 
@@ -879,14 +869,14 @@ def preprocess(
             rescaled_images_array = []
             for image in images_list:
                 rescaled_images_array.append(
-                    [rescale(img, rescale_factor, input_data_format=input_data_format) for img in image]
+                    [rescale(img, rescale_factor, data_format=data_format) for img in image]
                 )
             images_list = rescaled_images_array
 
         if do_normalize:
             images_list = [
                 [
-                    self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                    self.normalize(image=image, mean=image_mean, std=image_std, data_format=data_format)
                     for image in images
                 ]
                 for images in images_list
@@ -895,18 +885,9 @@ def preprocess(
         pixel_attention_mask = None
         if do_pad:
             images_list, pixel_attention_mask = self.pad(
-                images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
+                images_list, return_pixel_mask=True, return_tensors=return_tensors, data_format=data_format
             )
 
-        if data_format is not None:
-            images_list = [
-                [
-                    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
-                    for image in images
-                ]
-                for images in images_list
-            ]
-
         data = {
             "pixel_values": np.array(images_list) if do_pad and return_tensors is not None else images_list
         }  # Faster tensor conversion
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 6955e7ae7bb0c8..10042ca4529204 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -146,7 +146,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
         self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
         self.image_token = AddedToken("<image>", normalized=False, special=True)
         self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
-        self.global_img_token = "<global-img>"
+        self.global_image_tag = "<global-img>"  # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341
         self.image_seq_len = image_seq_len
 
         # This regex matches one or more occurrences of <global-img> tags (optionally surrounded by newline characters)
@@ -226,20 +226,8 @@ def __call__(
                 Wherever an image token, `<image>` is encountered it is expanded to
                 `<fake_token_around_image>` + `<row_x_col_y>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
             image_seq_len (`int`, *optional*):
-                The length of the image sequence. If not provided, the default value of 169 is used.
+                The length of the image sequence. If not provided, the default value of `self.image_seq_len` is used.
                 image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
-            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
-                Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
-            truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
-                Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information.
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding/truncation length. See
-                [`PreTrainedTokenizerFast.__call__`] for more information.
-            is_split_into_words (`bool`, *optional*, defaults to `False`):
-                Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the
-                tokenization process and assume the input is already tokenized.
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information.
             return_tensors (`Union[str, TensorType]`, *optional*):
                 If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
                 information.
@@ -294,7 +282,7 @@ def __call__(
 
             fake_image_token = self.fake_image_token.content
             image_token = self.image_token.content
-            global_img_token = self.global_img_token
+            global_img_token = self.global_image_tag
 
             prompt_strings = []
             for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py
index 20e3d85e743695..96534a2b2b0335 100644
--- a/tests/models/idefics3/test_image_processing_idefics3.py
+++ b/tests/models/idefics3/test_image_processing_idefics3.py
@@ -226,14 +226,14 @@ def test_call_numpy_4_channels(self):
 
             # Test not batched input
             encoded_images = image_processing(
-                image_inputs[0], input_data_format="channels_last", return_tensors="pt"
+                image_inputs[0], return_tensors="pt"
             ).pixel_values
             expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
             self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
 
             # Test batched
             encoded_images = image_processing(
-                image_inputs, input_data_format="channels_last", return_tensors="pt"
+                image_inputs, return_tensors="pt"
             ).pixel_values
             expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
             self.assertEqual(
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 997847c1ba5210..3f7f4f22791907 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -63,7 +63,7 @@ def setUpClass(cls):
         cls.bos_token = processor.tokenizer.bos_token
         cls.image_token = processor.image_token.content
         cls.fake_image_token = processor.fake_image_token.content
-        cls.global_img_token = processor.global_img_token
+        cls.global_img_token = processor.global_image_tag
 
         cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
         cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)

From 9a203066c3aef920eef226df64216407a995ad42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Mon, 16 Sep 2024 14:23:41 +0200
Subject: [PATCH 38/50] Update
 src/transformers/models/idefics3/image_processing_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/models/idefics3/image_processing_idefics3.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index e879316598c916..d59fb4dbd3e064 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -459,7 +459,6 @@ def split_image(
         image,
         max_image_size: Dict[str, int],
         data_format: Optional[Union[str, ChannelDimension]] = None,
-        resample: PILImageResampling = PILImageResampling.LANCZOS,
     ):
         """
         Split an image into squares of side max_image_size and the original image resized to max_image_size.

From 31299204f97523065fa902282bfa47e5d4d95d25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Mon, 16 Sep 2024 14:24:15 +0200
Subject: [PATCH 39/50] Update
 src/transformers/models/idefics3/image_processing_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/models/idefics3/image_processing_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index d59fb4dbd3e064..1dbbe7bfc4c2b0 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -610,7 +610,7 @@ def pad(
         For each sample in the batch, pads the sample with empty images to the max_number of images per sample in the batch. Optionally returns a pixel mask.
 
         Args:
-            images (`np.ndarray`):
+            images (`List[List[np.ndarray]]`):
                 List of list of images to pad. Pads to the largest height and width in the batch.
             constant_values (`float` or `Iterable[float]`, *optional*):
                 The value to use for the padding if `mode` is `"constant"`.

From e1a10b3ab8f4fe7b8edd2ff95f4dbd124d10d854 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Mon, 16 Sep 2024 17:37:50 +0200
Subject: [PATCH 40/50] Update
 src/transformers/models/idefics3/modeling_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/models/idefics3/modeling_idefics3.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index ddedc663be2a91..086932d012ce69 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -890,9 +890,8 @@ def inputs_merger(
         """
         num_images, _, vision_hidden_size = image_hidden_states.shape
         special_image_token_mask = input_ids == self.image_token_id
-        new_inputs_embeds = (
-            inputs_embeds.clone()
-        )  #  Fixes RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
+        #  Fixes RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
+        new_inputs_embeds = inputs_embeds.clone()
         reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size)
         # cast to the dtype of the input_embeds to support quantized models
         reshaped_image_hidden_states = reshaped_image_hidden_states.to(inputs_embeds.dtype)

From 4c3756fd3e5e626a42d3a1cd58b0dfcf7fc3187a Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 16 Sep 2024 15:54:21 +0000
Subject: [PATCH 41/50] ruff

---
 .../models/idefics3/image_processing_idefics3.py    | 13 ++++++++++---
 .../idefics3/test_image_processing_idefics3.py      |  8 ++------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 1dbbe7bfc4c2b0..91dcb9d486f3f7 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -266,7 +266,9 @@ def to_pil_image(
     image = to_numpy_array(image)
 
     # If the channel has been moved to first dim, we put it back at the end.
-    image = to_channel_dimension_format(image, ChannelDimension.LAST, infer_channel_dimension_format(image, num_channels=(1, 3, 4)))
+    image = to_channel_dimension_format(
+        image, ChannelDimension.LAST, infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+    )
 
     # If there is a single channel, we squeeze it, as otherwise PIL can't handle it.
     image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image
@@ -459,6 +461,7 @@ def split_image(
         image,
         max_image_size: Dict[str, int],
         data_format: Optional[Union[str, ChannelDimension]] = None,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
     ):
         """
         Split an image into squares of side max_image_size and the original image resized to max_image_size.
@@ -633,7 +636,9 @@ def pad(
         batch_size = len(images)
         max_num_images = max(len(images_) for images_ in images)
         input_data_format = (
-            infer_channel_dimension_format(images[0][0], num_channels=(1, 3, 4)) if input_data_format is None else input_data_format
+            infer_channel_dimension_format(images[0][0], num_channels=(1, 3, 4))
+            if input_data_format is None
+            else input_data_format
         )
         data_format = input_data_format if data_format is None else data_format
 
@@ -787,7 +792,9 @@ def preprocess(
         if data_format is not None:
             images_list = [
                 [
-                    to_channel_dimension_format(image, data_format, infer_channel_dimension_format(image, num_channels=(1, 3, 4)))
+                    to_channel_dimension_format(
+                        image, data_format, infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+                    )
                     for image in images
                 ]
                 for images in images_list
diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py
index 96534a2b2b0335..a00e7f962208a9 100644
--- a/tests/models/idefics3/test_image_processing_idefics3.py
+++ b/tests/models/idefics3/test_image_processing_idefics3.py
@@ -225,16 +225,12 @@ def test_call_numpy_4_channels(self):
                     self.assertIsInstance(image, np.ndarray)
 
             # Test not batched input
-            encoded_images = image_processing(
-                image_inputs[0], return_tensors="pt"
-            ).pixel_values
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
             expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
             self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
 
             # Test batched
-            encoded_images = image_processing(
-                image_inputs, return_tensors="pt"
-            ).pixel_values
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
             expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
             self.assertEqual(
                 tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)

From fbaf07e699391818f6403a2c5a4c81de1240c353 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Mon, 16 Sep 2024 16:02:21 +0000
Subject: [PATCH 42/50] amy's comment on the order

---
 docs/source/en/model_doc/idefics3.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 3ccdc3f576767f..dfaf40477a7b52 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -26,6 +26,12 @@ Idefics3 is an adaptation of the Idefics2 model with three main differences:
 - It uses an updated processing logic for the images.
 - It removes the perceiver.
 
+The abstract from the paper is the following:
+
+*The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.*
+
+## Usage tips
+
 Input images are processed either by upsampling (if resizing is enabled) or at their original resolution. The resizing behavior depends on two parameters: do_resize and size.
 
 If `do_resize` is set to `True`, the model resizes images so that the longest edge is 4*364 pixels by default.
@@ -38,10 +44,6 @@ image_processor = Idefics3ImageProcessor(do_resize=True, size={"longest_edge": 2
 
 Additionally, the `max_image_size` parameter, which controls the size of each square patch the image is decomposed into, is set to 364 by default but can be adjusted as needed. After resizing (if applicable), the image processor decomposes the images into square patches based on the `max_image_size` parameter.
 
-The abstract from the paper is the following:
-
-*The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.*
-
 This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
 
 

From 87fa179e54e23eedf6a5a17277d3948dd3689e40 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 17 Sep 2024 08:24:59 +0000
Subject: [PATCH 43/50] ruff ruff

---
 src/transformers/models/idefics3/image_processing_idefics3.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 91dcb9d486f3f7..d66c8a15ea2d01 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -874,9 +874,7 @@ def preprocess(
         if do_rescale:
             rescaled_images_array = []
             for image in images_list:
-                rescaled_images_array.append(
-                    [rescale(img, rescale_factor, data_format=data_format) for img in image]
-                )
+                rescaled_images_array.append([rescale(img, rescale_factor, data_format=data_format) for img in image])
             images_list = rescaled_images_array
 
         if do_normalize:

From 23d4cf8d57574e53496ed76f94171c8960cb39ee Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 17 Sep 2024 08:53:14 +0000
Subject: [PATCH 44/50] fix copies

---
 .../models/idefics3/modeling_idefics3.py            | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 086932d012ce69..cde21b42bb90f6 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -398,7 +398,7 @@ def forward(self, x):
 
 # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2EncoderLayer with Idefics2->Idefics3
 class Idefics3EncoderLayer(nn.Module):
-    def __init__(self, config: Idefics3Config):
+    def __init__(self, config: Idefics3VisionConfig):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = IDEFICS_VISION_ATTENTION_CLASSES[config._attn_implementation](config)
@@ -1228,7 +1228,13 @@ def forward(
 
     # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.prepare_inputs_for_generation
     def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        num_logits_to_keep=None,
+        **kwargs,
     ):
         past_length = 0
         # Omit tokens covered by past_key_values
@@ -1271,6 +1277,9 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}
 
+        if num_logits_to_keep is not None:
+            model_inputs["num_logits_to_keep"] = num_logits_to_keep
+
         image_hidden_states = kwargs.get("image_hidden_states", None)
         if image_hidden_states is not None:
             pixel_values = None

From 9e925b9d085005fe6c4c0c7be3b62ed89be63e7c Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 17 Sep 2024 10:15:08 +0000
Subject: [PATCH 45/50] square images when they are not split

---
 .../models/idefics3/image_processing_idefics3.py          | 8 ++++++++
 tests/models/idefics3/test_processing_idefics3.py         | 6 +++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index d66c8a15ea2d01..9953f484ec125d 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -862,6 +862,14 @@ def preprocess(
             images_list = images_list_split_arrays
             palettes_list = palettes_list_split_arrays
         else:
+            # We square the images to max_image_size
+            images_list = [
+                [
+                    self.resize(image=image, size={'height': max_image_size["longest_edge"], "width": max_image_size["longest_edge"]}, resample=resample, input_data_format=data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
             images_list_rows = [[0] * len(images) for images in images_list]
             images_list_cols = [[0] * len(images) for images in images_list]
 
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 3f7f4f22791907..c204ecdb4151b8 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -113,7 +113,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
 
         # Test that a single image is processed correctly
         inputs = processor(images=self.image1)
-        image1_expected_size = (970, 1456)
+        image1_expected_size = (364, 364)
         self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
         self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
         # fmt: on
@@ -163,8 +163,8 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
             inputs["attention_mask"],
             [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
         )
-        self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 2, 3, 1140, 1456))
-        self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 2, 1140, 1456))
+        self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 2, 3, 364, 364))
+        self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 2, 364, 364))
         # fmt: on
 
     def test_process_interleaved_images_prompts_image_splitting(self):

From 215b636e10bfa4df2ab13f47ce477426550533d9 Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Tue, 17 Sep 2024 10:17:47 +0000
Subject: [PATCH 46/50] ruff :(

---
 .../models/idefics3/image_processing_idefics3.py           | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 9953f484ec125d..80571c1b13444f 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -865,7 +865,12 @@ def preprocess(
             # We square the images to max_image_size
             images_list = [
                 [
-                    self.resize(image=image, size={'height': max_image_size["longest_edge"], "width": max_image_size["longest_edge"]}, resample=resample, input_data_format=data_format)
+                    self.resize(
+                        image=image,
+                        size={"height": max_image_size["longest_edge"], "width": max_image_size["longest_edge"]},
+                        resample=resample,
+                        input_data_format=data_format,
+                    )
                     for image in images
                 ]
                 for images in images_list

From 29679743ca6f6747c669bc983f55812008f26343 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Wed, 18 Sep 2024 14:29:10 +0200
Subject: [PATCH 47/50] Update
 src/transformers/models/idefics3/image_processing_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 .../models/idefics3/image_processing_idefics3.py             | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 80571c1b13444f..fa037b45296420 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -885,10 +885,7 @@ def preprocess(
             ]
 
         if do_rescale:
-            rescaled_images_array = []
-            for image in images_list:
-                rescaled_images_array.append([rescale(img, rescale_factor, data_format=data_format) for img in image])
-            images_list = rescaled_images_array
+            images_list = [[self.rescale(img, rescale_factor, data_format=data_format) for image in images] for images in images_list]
 
         if do_normalize:
             images_list = [

From ee041bf2133c27b4180e03667b16bf0735322e59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Marafioti?= <andimarafioti@gmail.com>
Date: Wed, 18 Sep 2024 14:29:28 +0200
Subject: [PATCH 48/50] Update
 tests/models/idefics3/test_processing_idefics3.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 tests/models/idefics3/test_processing_idefics3.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index c204ecdb4151b8..b148d2e9472ceb 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -92,9 +92,8 @@ def get_splitted_image_expected_tokens(self, processor, image_rows, image_cols):
                 )
             text_split_images += processor.tokenizer("\n", add_special_tokens=False)["input_ids"]
         text_split_images = text_split_images[:-1]  # remove last newline
-        text_split_images += processor.tokenizer("\n\n", add_special_tokens=False)[
-            "input_ids"
-        ]  # add double newline, as it gets its own token
+        # add double newline, as it gets its own token
+        text_split_images += processor.tokenizer("\n\n", add_special_tokens=False)["input_ids"]
         text_split_images += (
             [self.fake_image_token_id]
             + self.global_img_tokens_id

From 4aad2663f554189a963d2b86e5c406b08d9fe30b Mon Sep 17 00:00:00 2001
From: Andres Marafioti <andimarafioti@gmail.com>
Date: Wed, 18 Sep 2024 12:35:40 +0000
Subject: [PATCH 49/50] fix small bug introduced in refactor

---
 src/transformers/models/idefics3/image_processing_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index fa037b45296420..cd5a87c149d89d 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -885,7 +885,7 @@ def preprocess(
             ]
 
         if do_rescale:
-            images_list = [[self.rescale(img, rescale_factor, data_format=data_format) for image in images] for images in images_list]
+            images_list = [[self.rescale(image, rescale_factor, data_format=data_format) for image in images] for images in images_list]
 
         if do_normalize:
             images_list = [

From ab8e6dd2ec8deacbf761595cfb46a107699afb91 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Thu, 19 Sep 2024 17:25:22 +0000
Subject: [PATCH 50/50] Idefics3 - resolve data_format for image processing

---
 src/transformers/image_transforms.py          |   5 +-
 .../idefics3/image_processing_idefics3.py     | 216 +++++++++---------
 2 files changed, 111 insertions(+), 110 deletions(-)

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index baf5ec95c4b8d0..4fef6012012f36 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -162,6 +162,7 @@ def _rescale_for_pil_conversion(image):
 def to_pil_image(
     image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
     do_rescale: Optional[bool] = None,
+    image_mode: Optional[str] = None,
     input_data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> "PIL.Image.Image":
     """
@@ -175,6 +176,8 @@ def to_pil_image(
             Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
             to `True` if the image type is a floating type and casting to `int` would result in a loss of precision,
             and `False` otherwise.
+        image_mode (`str`, *optional*):
+            The mode to use for the PIL image. If unset, will use the default mode for the input image type.
         input_data_format (`ChannelDimension`, *optional*):
             The channel dimension format of the input image. If unset, will use the inferred format from the input.
 
@@ -207,7 +210,7 @@ def to_pil_image(
         image = rescale(image, 255)
 
     image = image.astype(np.uint8)
-    return PIL.Image.fromarray(image)
+    return PIL.Image.fromarray(image, mode=image_mode)
 
 
 # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index cd5a87c149d89d..fd4c4aea67f791 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -21,7 +21,7 @@
 from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available
 
 from ...image_processing_utils import BaseImageProcessor, BatchFeature
-from ...image_transforms import PaddingMode, pad, rescale, to_channel_dimension_format
+from ...image_transforms import PaddingMode, pad, to_channel_dimension_format, to_pil_image
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
@@ -40,13 +40,13 @@
 
 
 if is_torch_available():
-    import torch
+    pass
 
 if is_tf_available():
-    import tensorflow as tf
+    pass
 
 if is_flax_available():
-    import jax.numpy as jnp
+    pass
 
 logger = logging.get_logger(__name__)
 
@@ -242,71 +242,46 @@ def make_pixel_mask(
     return mask
 
 
-# Custom to_pil_image function to support image_mode
-def to_pil_image(
-    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
-    image_mode: Optional[str] = None,
-) -> "PIL.Image.Image":
-    """
-    Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
-    needed.
-
-    Args:
-        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
-            The image to convert to the `PIL.Image` format.
-        image_mode (`str`, *optional*):
-            The mode of the image.
-
-    Returns:
-        `PIL.Image.Image`: The converted image.
-    """
-    if isinstance(image, PIL.Image.Image):
-        return image
-    # Convert all tensors to numpy arrays before converting to PIL image
-    image = to_numpy_array(image)
-
-    # If the channel has been moved to first dim, we put it back at the end.
-    image = to_channel_dimension_format(
-        image, ChannelDimension.LAST, infer_channel_dimension_format(image, num_channels=(1, 3, 4))
-    )
-
-    # If there is a single channel, we squeeze it, as otherwise PIL can't handle it.
-    image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image
-    image = image.astype(np.uint8)
-    return PIL.Image.fromarray(image, mode=image_mode)
-
-
 def convert_to_rgb(
-    image: ImageInput,
+    image: np.ndarray,
     palette: Optional[PIL.ImagePalette.ImagePalette] = None,
     data_format: Optional[Union[str, ChannelDimension]] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> ImageInput:
     """
-    Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
-    as is.
+    Converts an image to RGB format.
+
     Args:
-        image (Image):
+        image (`np.ndarray`):
             The image to convert.
         palette (List[int], *optional*):
             The palette to use if given.
+        data_format (ChannelDimension or str, *optional*):
+            The channel dimension format for the output image. If not provided, it will be the same as the input image.
         input_data_format (ChannelDimension or str, *optional*):
             The channel dimension format of the input image.
     """
-    if not isinstance(image, PIL.Image.Image):
-        mode = "P" if palette is not None else None
-        # Custom to_pil_image function to support image_mode
-        image = to_pil_image(image, image_mode=mode)
-        if image.mode == "P" and palette is not None:
-            image.putpalette(palette)
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+
+    # For all transformations, we want to keep the same data format as the input image unless otherwise specified.
+    # The image converted back from PIL will always have channels last, so find the input format first.
+    data_format = input_data_format if data_format is None else data_format
+
+    mode = "P" if palette is not None else None
+    # Request palette mode ("P") when a palette is given so it can be re-attached to the PIL image below
+    image = to_pil_image(image, image_mode=mode)
+    if image.mode == "P" and palette is not None:
+        image.putpalette(palette)
 
     image_rgba = image.convert("RGBA")
     background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
     alpha_composite = Image.alpha_composite(background, image_rgba)
     alpha_composite = alpha_composite.convert("RGB")
+
     output_array = np.array(alpha_composite)
-    output_data_format = infer_channel_dimension_format(output_array, num_channels=(1, 3, 4))
-    if data_format != output_data_format:
-        output_array = to_channel_dimension_format(output_array, data_format, output_data_format)
+    # The image is always in channels last format after converting from a PIL image
+    output_array = to_channel_dimension_format(output_array, data_format, input_channel_dim=ChannelDimension.LAST)
     return output_array
 
 
@@ -411,6 +386,7 @@ def resize(
         image: np.ndarray,
         size: Dict[str, int],
         resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
     ) -> np.ndarray:
@@ -425,12 +401,18 @@ def resize(
                 Size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be the same as the input image.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
         if input_data_format is None:
             input_data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
 
+        # For all transformations, we want to keep the same data format as the input image unless otherwise specified.
+        # The resized image from PIL will always have channels last, so find the input format first.
+        data_format = input_data_format if data_format is None else data_format
+
         if "longest_edge" in size:
             size = get_resize_output_image_size(
                 image, resolution_max_side=size["longest_edge"], input_data_format=input_data_format
@@ -439,29 +421,31 @@ def resize(
             size = (size["height"], size["width"])
         else:
             raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
+
         image_mode = None
         if image.ndim == 2 or image.shape[-1] == 1:
             image_mode = "P"
-        # Custom to_pil_image function to support image_mode
         image = to_pil_image(image, image_mode=image_mode)
 
         resized_image = image.resize((size[1], size[0]), resample=resample)
-        resized_array = np.array(resized_image)
-        if resized_array.ndim == 3 and input_data_format == ChannelDimension.FIRST:
-            resized_array = np.moveaxis(resized_array, -1, 0)
-        if resized_array.ndim == 2:
-            if input_data_format == ChannelDimension.FIRST:
-                resized_array = np.expand_dims(resized_array, axis=0)
-            elif input_data_format == ChannelDimension.LAST:
-                resized_array = np.expand_dims(resized_array, axis=-1)
-        return resized_array
+        resized_image = np.array(resized_image)
+
+        # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image
+        # so we need to add it back if necessary.
+        resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image
+        # The image is always in channels last format after converting from a PIL image
+        resized_image = to_channel_dimension_format(
+            resized_image, data_format, input_channel_dim=ChannelDimension.LAST
+        )
+        return resized_image
 
     def split_image(
         self,
         image,
         max_image_size: Dict[str, int],
-        data_format: Optional[Union[str, ChannelDimension]] = None,
         resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
     ):
         """
         Split an image into squares of side max_image_size and the original image resized to max_image_size.
@@ -478,12 +462,14 @@ def split_image(
             max_image_size (`Dict[str, int]`):
                 Maximum size of the output image. If the image is larger than this size, it will be split into
                 patches of this size, and the original image will be concatenated with the patches, resized to max_size.
-            data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format of the output image. If not provided, it will be the same as the input image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        height, width = get_image_size(image, channel_dim=data_format)
+        height, width = get_image_size(image, channel_dim=input_data_format)
         max_height = max_width = max_image_size["longest_edge"]
 
         frames = []
@@ -538,6 +524,7 @@ def resize_for_vision_encoder(
         image: np.ndarray,
         vision_encoder_max_size: int,
         resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
     ):
         """
@@ -551,15 +538,13 @@ def resize_for_vision_encoder(
                 patches of this size, and the original image will be concatenated with the patches, resized to max_size.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
                 Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be the same as the input image.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred
         """
-        if input_data_format == ChannelDimension.FIRST:
-            _, height, width = image.shape
-        elif input_data_format == ChannelDimension.LAST:
-            height, width, _ = image.shape
-        else:
-            raise ValueError("Invalid input_data_format.")
+        height, width = get_image_size(image, channel_dim=input_data_format)
+
         aspect_ratio = width / height
         if width >= height:
             width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
@@ -570,7 +555,9 @@ def resize_for_vision_encoder(
             width = int(height * aspect_ratio)
             width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
         new_size = {"height": height, "width": width}
-        return self.resize(image, size=new_size, resample=resample, input_data_format=input_data_format)
+        return self.resize(
+            image, size=new_size, resample=resample, input_data_format=input_data_format, data_format=data_format
+        )
 
     def _pad_image(
         self,
@@ -694,6 +681,7 @@ def preprocess(
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_row_col_info: bool = False,
         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
     ):
         """
         Preprocess a batch of images.
@@ -743,6 +731,12 @@ def preprocess(
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
@@ -765,6 +759,17 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
         # save the palettes for conversion to RGB
         palettes_list = [
             [im.getpalette() if isinstance(im, Image.Image) and im.mode == "P" else None for im in images]
@@ -780,44 +785,26 @@ def preprocess(
                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
             )
 
-        if data_format is None or data_format == ChannelDimension.LAST:
+        # We assume that all images have the same channel dimension format.
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
+
+        # Extra channel dimension for grayscale images
+        if input_data_format == ChannelDimension.LAST:
             images_list = [
                 [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
             ]
-        elif data_format == ChannelDimension.FIRST:
+        elif input_data_format == ChannelDimension.FIRST:
             images_list = [
                 [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
             ]
-
-        if data_format is not None:
-            images_list = [
-                [
-                    to_channel_dimension_format(
-                        image, data_format, infer_channel_dimension_format(image, num_channels=(1, 3, 4))
-                    )
-                    for image in images
-                ]
-                for images in images_list
-            ]
         else:
-            # We assume that all images have the same channel dimension format.
-            data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
-
-        validate_preprocess_arguments(
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            do_resize=do_resize,
-            size=size,
-            resample=resample,
-        )
+            raise ValueError(f"Invalid channel dimension format {input_data_format}.")
 
         if do_resize:
             images_list = [
                 [
-                    self.resize(image=image, size=size, resample=resample, input_data_format=data_format)
+                    self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
                     for image in images
                 ]
                 for images in images_list
@@ -830,7 +817,7 @@ def preprocess(
             images_list = [
                 [
                     self.resize_for_vision_encoder(
-                        image, max_image_size["longest_edge"], resample=resample, input_data_format=data_format
+                        image, max_image_size["longest_edge"], resample=resample, input_data_format=input_data_format
                     )
                     for image in images
                 ]
@@ -849,7 +836,7 @@ def preprocess(
                     split_image_array, rows, cols = self.split_image(
                         image,
                         max_image_size=max_image_size,
-                        data_format=data_format,
+                        input_data_format=input_data_format,
                     )
                     split_image_arrays.extend(split_image_array)
                     split_palettes_arrays.extend([palette] * len(split_image_array))
@@ -869,7 +856,7 @@ def preprocess(
                         image=image,
                         size={"height": max_image_size["longest_edge"], "width": max_image_size["longest_edge"]},
                         resample=resample,
-                        input_data_format=data_format,
+                        input_data_format=input_data_format,
                     )
                     for image in images
                 ]
@@ -880,17 +867,20 @@ def preprocess(
 
         if do_convert_rgb:
             images_list = [
-                [convert_to_rgb(img, plt, data_format=data_format) for img, plt in zip(images, palettes)]
+                [convert_to_rgb(img, palette) for img, palette in zip(images, palettes)]
                 for images, palettes in zip(images_list, palettes_list)
             ]
 
         if do_rescale:
-            images_list = [[self.rescale(image, rescale_factor, data_format=data_format) for image in images] for images in images_list]
+            images_list = [
+                [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+                for images in images_list
+            ]
 
         if do_normalize:
             images_list = [
                 [
-                    self.normalize(image=image, mean=image_mean, std=image_std, data_format=data_format)
+                    self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                     for image in images
                 ]
                 for images in images_list
@@ -899,12 +889,20 @@ def preprocess(
         pixel_attention_mask = None
         if do_pad:
             images_list, pixel_attention_mask = self.pad(
-                images_list, return_pixel_mask=True, return_tensors=return_tensors, data_format=data_format
+                images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
             )
 
-        data = {
-            "pixel_values": np.array(images_list) if do_pad and return_tensors is not None else images_list
-        }  # Faster tensor conversion
+        if data_format is not None:
+            images_list = [
+                [
+                    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        # Faster tensor conversion
+        data = {"pixel_values": np.array(images_list) if do_pad and return_tensors is not None else images_list}
         if pixel_attention_mask is not None:
             data["pixel_attention_mask"] = (
                 np.array(pixel_attention_mask) if do_pad and return_tensors is not None else pixel_attention_mask