From c8c41a822d6bf97c62d3fd6c2e51c5c21f6635bc Mon Sep 17 00:00:00 2001
From: Niels
Date: Sat, 14 Sep 2024 21:31:54 +0200
Subject: [PATCH] Fix style

---
 docs/source/en/model_doc/pixtral.md             |  2 +-
 .../models/pixtral/configuration_pixtral.py     | 14 +++++---------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md
index dfb3df7477708a..7b48b8dcdf9d5e 100644
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@@ -23,7 +23,7 @@ The Pixtral model was released by the Mistral AI team on [vLLM](https://github.c
 Tips:
 
 - Pixtral is a multimodal model, taking images and text as input, and producing text as output.
-- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder.
+- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder.
 - The main contribution is the 2d ROPE (rotary postiion embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized).
 - The format for one or mulitple prompts is the following:
 ```
diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py
index a2fdc9b2ed303a..32325a929411ba 100644
--- a/src/transformers/models/pixtral/configuration_pixtral.py
+++ b/src/transformers/models/pixtral/configuration_pixtral.py
@@ -23,8 +23,8 @@ class PixtralVisionConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`PixtralVisionModel`]. It is used to instantiate an
-    Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the Pixtral-9B.
+    Pixtral vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to the vision encoder used by Pixtral-12B.
 
     e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)
 
@@ -52,18 +52,16 @@ class PixtralVisionConfig(PretrainedConfig):
             Dropout probability for the attention layers.
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie the word embeddings with the input embeddings.
 
     Example:
 
     ```python
-    >>> from transformers import PixtralVisionModel, PixtralVisionConfig, CLIPVisionConfig, LlamaConfig
+    >>> from transformers import PixtralVisionModel, PixtralVisionConfig
 
-    >>> # Initializing a Pixtral 12B style configuration
+    >>> # Initializing a Pixtral-12B style configuration
     >>> config = PixtralVisionConfig()
 
-    >>> # Initializing a model from the pixtral 12B style configuration
+    >>> # Initializing a model (with randomly initialized weights) from the configuration
     >>> model = PixtralVisionModel(configuration)
 
     >>> # Accessing the model configuration
@@ -84,7 +82,6 @@ def __init__(
         hidden_act="gelu",
         attention_dropout=0.0,
         rope_theta=10000.0,
-        tie_word_embeddings=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -99,5 +96,4 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.hidden_act = hidden_act
         self.rope_theta = rope_theta
-        self.tie_word_embeddings = tie_word_embeddings
        self.head_dim = hidden_size // num_attention_heads
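
For reference, a minimal sketch of the intended usage after this patch, assuming a transformers version that ships Pixtral support. Note that the docstring's unchanged context line `>>> model = PixtralVisionModel(configuration)` references a name that is never defined (the variable two lines earlier is `config`); the sketch below uses a consistent name:

```python
from transformers import PixtralVisionConfig, PixtralVisionModel

# Default configuration, which the updated docstring describes as matching the
# vision encoder used by Pixtral-12B. After this patch, tie_word_embeddings is
# gone from the signature: it is a text-model option that does not apply to a
# vision encoder.
config = PixtralVisionConfig()

# Build a randomly initialized vision encoder from the configuration.
model = PixtralVisionModel(config)

# head_dim is derived in __init__ rather than passed explicitly.
assert config.head_dim == config.hidden_size // config.num_attention_heads
```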