
Commit

Fix style
NielsRogge committed Sep 14, 2024
1 parent 628331e commit c8c41a8
Showing 2 changed files with 6 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/pixtral.md
@@ -23,7 +23,7 @@ The Pixtral model was released by the Mistral AI team on [vLLM](https://github.c
Tips:

- Pixtral is a multimodal model, taking images and text as input, and producing text as output.
-- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder.
+- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder.
- The main contribution is the 2D RoPE (rotary position embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized).
- The format for one or multiple prompts is the following:
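For orientation, a hedged usage sketch consistent with the tips above. The checkpoint id `mistral-community/pixtral-12b`, the `[INST][IMG]...[/INST]` prompt template, and the Llava-style `AutoProcessor`/`LlavaForConditionalGeneration` pairing are assumptions rather than content of this commit (the diff collapses the doc's own format example):

```python
# Hedged sketch: checkpoint id and prompt template are assumptions, not from this diff.
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "mistral-community/pixtral-12b"  # assumed Pixtral checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)

# One image per [IMG] placeholder; its embeddings replace the placeholder tokens.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "<s>[INST][IMG]\nDescribe this image.[/INST]"  # assumed prompt format

inputs = processor(images=image, text=prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output[0], skip_special_tokens=True))
```

Since Pixtral accepts arbitrary image sizes, the processor should emit a variable-length patch sequence per image rather than a fixed grid.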
14 changes: 5 additions & 9 deletions src/transformers/models/pixtral/configuration_pixtral.py
@@ -23,8 +23,8 @@
class PixtralVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PixtralVisionModel`]. It is used to instantiate an
-Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
-with the defaults will yield a similar configuration to that of the Pixtral-9B.
+Pixtral vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+with the defaults will yield a similar configuration to the vision encoder used by Pixtral-12B.
e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)
@@ -52,18 +52,16 @@ class PixtralVisionConfig(PretrainedConfig):
Dropout probability for the attention layers.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
-tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-Whether to tie the word embeddings with the input embeddings.
Example:
```python
->>> from transformers import PixtralVisionModel, PixtralVisionConfig, CLIPVisionConfig, LlamaConfig
+>>> from transformers import PixtralVisionModel, PixtralVisionConfig

->>> # Initializing a Pixtral 12B style configuration
+>>> # Initializing a Pixtral-12B style configuration
>>> config = PixtralVisionConfig()
->>> # Initializing a model from the pixtral 12B style configuration
+>>> # Initializing a model (with randomly initialized weights) from the configuration
>>> model = PixtralVisionModel(config)
>>> # Accessing the model configuration
@@ -84,7 +82,6 @@ def __init__(
hidden_act="gelu",
attention_dropout=0.0,
rope_theta=10000.0,
-tie_word_embeddings=False,
**kwargs,
):
super().__init__(**kwargs)
@@ -99,5 +96,4 @@ def __init__(
self.attention_dropout = attention_dropout
self.hidden_act = hidden_act
self.rope_theta = rope_theta
-self.tie_word_embeddings = tie_word_embeddings
self.head_dim = hidden_size // num_attention_heads
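For reference, a minimal sketch of the trimmed configuration in use, assuming only what this diff shows (defaults matching the Pixtral-12B vision encoder, with `head_dim` derived in `__init__`):

```python
from transformers import PixtralVisionConfig, PixtralVisionModel

# Defaults approximate the vision encoder used by Pixtral-12B.
config = PixtralVisionConfig()
print(config.head_dim)  # hidden_size // num_attention_heads, computed in __init__

# tie_word_embeddings is no longer set here; any such kwarg now reaches the
# PretrainedConfig base class through **kwargs instead.
model = PixtralVisionModel(config)  # randomly initialized weights
```

Dropping the attribute is consistent with this being a vision-only encoder configuration: there are no word embeddings to tie.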
