From c8c41a822d6bf97c62d3fd6c2e51c5c21f6635bc Mon Sep 17 00:00:00 2001
From: Niels
Date: Sat, 14 Sep 2024 21:31:54 +0200
Subject: [PATCH] Fix style

---
 docs/source/en/model_doc/pixtral.md             |  2 +-
 .../models/pixtral/configuration_pixtral.py     | 14 +++++---------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md
index dfb3df7477708a..7b48b8dcdf9d5e 100644
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@@ -23,7 +23,7 @@ The Pixtral model was released by the Mistral AI team on [vLLM](https://github.c
 Tips:
 
 - Pixtral is a multimodal model, taking images and text as input, and producing text as output.
-- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder.
+- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder.
 - The main contribution is the 2d ROPE (rotary postiion embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized).
 - The format for one or mulitple prompts is the following:
 ```
diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py
index a2fdc9b2ed303a..32325a929411ba 100644
--- a/src/transformers/models/pixtral/configuration_pixtral.py
+++ b/src/transformers/models/pixtral/configuration_pixtral.py
@@ -23,8 +23,8 @@ class PixtralVisionConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`PixtralVisionModel`]. It is used to instantiate an
-    Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the Pixtral-9B.
+    Pixtral vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to the vision encoder used by Pixtral-12B.
 
     e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)
 
@@ -52,18 +52,16 @@ class PixtralVisionConfig(PretrainedConfig):
             Dropout probability for the attention layers.
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie the word embeddings with the input embeddings.
 
     Example:
 
     ```python
-    >>> from transformers import PixtralVisionModel, PixtralVisionConfig, CLIPVisionConfig, LlamaConfig
+    >>> from transformers import PixtralVisionModel, PixtralVisionConfig
 
-    >>> # Initializing a Pixtral 12B style configuration
+    >>> # Initializing a Pixtral-12B style configuration
     >>> config = PixtralVisionConfig()
 
-    >>> # Initializing a model from the pixtral 12B style configuration
+    >>> # Initializing a model (with randomly initialized weights) from the configuration
     >>> model = PixtralVisionModel(configuration)
 
     >>> # Accessing the model configuration
@@ -84,7 +82,6 @@ def __init__(
         hidden_act="gelu",
         attention_dropout=0.0,
         rope_theta=10000.0,
-        tie_word_embeddings=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -99,5 +96,4 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.hidden_act = hidden_act
         self.rope_theta = rope_theta
-        self.tie_word_embeddings = tie_word_embeddings
        self.head_dim = hidden_size // num_attention_heads
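
For reference, a minimal sketch of the intended usage after this patch, assuming a transformers version that ships Pixtral support. Note that the docstring's unchanged context line `>>> model = PixtralVisionModel(configuration)` references a name that is never defined (the variable two lines earlier is `config`); the sketch below uses a consistent name:

```python
from transformers import PixtralVisionConfig, PixtralVisionModel

# Default configuration, which the updated docstring describes as matching the
# vision encoder used by Pixtral-12B. After this patch, tie_word_embeddings is
# gone from the signature: it is a text-model option that does not apply to a
# vision encoder.
config = PixtralVisionConfig()

# Build a randomly initialized vision encoder from the configuration.
model = PixtralVisionModel(config)

# head_dim is derived in __init__ rather than passed explicitly.
assert config.head_dim == config.hidden_size // config.num_attention_heads
```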