From 95e816f2bca48de32167ce6243e6770dee23923d Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Mon, 16 Sep 2024 09:44:57 +0100
Subject: [PATCH 01/50] Cohere: update RoPE structure (#33408)
---
.../models/cohere/configuration_cohere.py | 43 +++++
.../models/cohere/modeling_cohere.py | 170 ++++++++++++++----
src/transformers/models/dbrx/modeling_dbrx.py | 2 +-
.../models/gemma/modeling_gemma.py | 2 +-
.../models/granite/modeling_granite.py | 2 +-
.../models/llama/configuration_llama.py | 2 +-
.../models/llama/modeling_llama.py | 2 +-
.../models/mistral/modeling_mistral.py | 2 +-
.../models/mixtral/modeling_mixtral.py | 2 +-
src/transformers/models/olmo/modeling_olmo.py | 2 +-
.../models/olmoe/modeling_olmoe.py | 2 +-
.../models/persimmon/modeling_persimmon.py | 2 +-
src/transformers/models/phi/modeling_phi.py | 2 +-
src/transformers/models/phi3/modeling_phi3.py | 2 +-
.../models/qwen2/modeling_qwen2.py | 2 +-
.../models/qwen2_moe/modeling_qwen2_moe.py | 2 +-
.../models/stablelm/modeling_stablelm.py | 2 +-
.../models/starcoder2/modeling_starcoder2.py | 2 +-
18 files changed, 190 insertions(+), 55 deletions(-)
diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py
index 73973bfad60b93..3c1237e5113789 100644
--- a/src/transformers/models/cohere/configuration_cohere.py
+++ b/src/transformers/models/cohere/configuration_cohere.py
@@ -20,6 +20,7 @@
"""Cohere model configuration"""
from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
from ...utils import logging
@@ -79,6 +80,43 @@ class CohereConfig(PretrainedConfig):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
+ rope_scaling (`Dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+ and you expect the model to work with a longer `max_position_embeddings`, we recommend updating this value
+ accordingly.
+ Expected contents:
+ `rope_type` (`str`):
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+ 'llama3'], with 'default' being the original RoPE implementation.
+ `factor` (`float`, *optional*):
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+ original maximum pre-trained length.
+ `original_max_position_embeddings` (`int`, *optional*):
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+ pretraining.
+ `attention_factor` (`float`, *optional*):
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+ computation. If unspecified, it defaults to the value recommended by the implementation, using the
+ `factor` field to infer the suggested value.
+ `beta_fast` (`float`, *optional*):
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+ ramp function. If unspecified, it defaults to 32.
+ `beta_slow` (`float`, *optional*):
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+ ramp function. If unspecified, it defaults to 1.
+ `short_factor` (`List[float]`, *optional*):
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+ size divided by the number of attention heads divided by 2
+ `long_factor` (`List[float]`, *optional*):
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+ size divided by the number of attention heads divided by 2
+ `low_freq_factor` (`float`, *optional*):
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+ `high_freq_factor` (`float`, *optional*):
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -121,6 +159,7 @@ def __init__(
eos_token_id=255001,
tie_word_embeddings=True,
rope_theta=10000.0,
+ rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
use_qk_norm=False,
@@ -144,10 +183,14 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.use_qk_norm = use_qk_norm
+ # Validate the correctness of rotary position embeddings parameters
+ rope_config_validation(self)
+
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index 4010d9ec3a4327..ae84a9ec2d1a43 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -37,6 +37,7 @@
BaseModelOutputWithPast,
CausalLMOutputWithPast,
)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
@@ -135,35 +136,97 @@ def forward(self, hidden_states):
class CohereRotaryEmbedding(nn.Module):
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ # Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
+ # the same parameterization. The differences are highlighted with a comment.
+
+ def __init__(
+ self,
+ dim=None,
+ max_position_embeddings=2048,
+ base=10000,
+ device=None,
+ scaling_factor=1.0,
+ rope_type="default",
+ config: Optional[CohereConfig] = None,
+ ):
super().__init__()
- self.scaling_factor = scaling_factor
- self.dim = dim
- self.max_position_embeddings = max_position_embeddings
- self.base = base
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+ # TODO (joao): remove the `if` below, only used for BC
+ self.rope_kwargs = {}
+ if config is None:
+ logger.warning_once(
+ "`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+ "`config` argument. All other arguments will be removed in v4.46"
+ )
+ self.rope_kwargs = {
+ "rope_type": rope_type,
+ "factor": scaling_factor,
+ "dim": dim,
+ "base": base,
+ "max_position_embeddings": max_position_embeddings,
+ }
+ self.rope_type = rope_type
+ self.max_seq_len_cached = max_position_embeddings
+ self.original_max_seq_len = max_position_embeddings
+ else:
+ # BC: "rope_type" was originally "type"
+ if config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
self.register_buffer("inv_freq", inv_freq, persistent=False)
+ self.original_inv_freq = self.inv_freq
+
+ def _dynamic_frequency_update(self, position_ids, device):
+ """
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
+ 1 - growing beyond the cached sequence length (allow scaling)
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+ """
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.max_seq_len_cached: # growth
+ inv_freq, self.attention_scaling = self.rope_init_fn(
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
+ self.max_seq_len_cached = seq_len
+
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+ self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
def forward(self, x, position_ids):
- # x: [bs, num_attention_heads, seq_len, head_size]
+ if "dynamic" in self.rope_type:
+ self._dynamic_frequency_update(position_ids, device=x.device)
+
+ # Core RoPE block
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
-
- # Force float32 since bfloat16 loses precision on long contexts
- # See https://github.com/huggingface/transformers/pull/29285
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
- emb = torch.repeat_interleave(freqs, 2, dim=-1)
+ emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation
cos = emb.cos()
sin = emb.sin()
- return cos, sin
+
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
def rotate_half(x):
- # Split and rotate
+ # Split and rotate. Note that this function is different from e.g. Llama.
x1 = x[..., ::2]
x2 = x[..., 1::2]
rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
@@ -272,17 +335,10 @@ def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
- self._init_rope()
- # Ignore copy
- def _init_rope(self):
- self.rotary_emb = CohereRotaryEmbedding(
- self.head_dim,
- max_position_embeddings=self.max_position_embeddings,
- base=self.rope_theta,
- )
+ # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
+ self.rotary_emb = CohereRotaryEmbedding(config=self.config)
- # Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
@@ -292,6 +348,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
@@ -310,7 +367,16 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- cos, sin = self.rotary_emb(value_states, position_ids)
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory."
+ )
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@@ -350,8 +416,7 @@ def forward(
return attn_output, attn_weights, past_key_value
-# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
-# TODO(joao): add me back asap :)
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
class CohereFlashAttention2(CohereAttention):
"""
Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays
@@ -377,6 +442,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if isinstance(past_key_value, StaticCache):
@@ -402,7 +468,16 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- cos, sin = self.rotary_emb(value_states, position_ids)
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory."
+ )
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@@ -418,7 +493,6 @@ def forward(
dropout_rate = self.attention_dropout if self.training else 0.0
- # Ignore copy
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in the correct dtype just to be sure everything works as expected.
@@ -465,8 +539,6 @@ def forward(
return attn_output, attn_weights, past_key_value
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere
-# TODO(joao): add me back asap :)
class CohereSdpaAttention(CohereAttention):
"""
Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -474,7 +546,6 @@ class CohereSdpaAttention(CohereAttention):
SDPA API.
"""
- # Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
@@ -484,6 +555,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -517,7 +589,16 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- cos, sin = self.rotary_emb(value_states, position_ids)
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory."
+ )
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@@ -587,6 +668,7 @@ def forward(
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
@@ -601,6 +683,11 @@ def forward(
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+ Indices depicting the position of the input sequence tokens in the sequence
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+ with `head_dim` being the embedding dimension of each attention head.
"""
residual = hidden_states
@@ -615,6 +702,7 @@ def forward(
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
+ position_embeddings=position_embeddings,
)
# Fully Connected
@@ -755,8 +843,7 @@ def _init_weights(self, module):
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
COHERE_START_DOCSTRING,
)
-# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere
-# TODO(joao): add me back asap :)
+# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE
class CohereModel(CoherePreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
@@ -776,6 +863,7 @@ def __init__(self, config: CohereConfig):
[CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+ self.rotary_emb = CohereRotaryEmbedding(config=config)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
@@ -787,14 +875,13 @@ def get_input_embeddings(self):
def set_input_embeddings(self, value):
self.embed_tokens = value
- # Ignore copy
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@@ -823,30 +910,33 @@ def forward(
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
- past_seen_tokens = 0
return_legacy_cache = False
if (
use_cache and not isinstance(past_key_values, Cache) and not self.training
): # kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+ logger.warning_once(
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
+ "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+ )
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
-
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
-
- # embed positions
hidden_states = inputs_embeds
+ # create position embeddings to be shared across the decoder layers
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
@@ -866,6 +956,7 @@ def forward(
output_attentions,
use_cache,
cache_position,
+ position_embeddings,
)
else:
layer_outputs = decoder_layer(
@@ -876,6 +967,7 @@ def forward(
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
+ position_embeddings=position_embeddings,
)
hidden_states = layer_outputs[0]
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index 8db9f6e8b7d09f..43bac44ba1be20 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1066,7 +1066,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 085751cd9bc039..b14e0a4b3d8ca5 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -862,7 +862,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index ff10b6e6d875f9..876f5ed2a7c8da 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -839,7 +839,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py
index 435f0091e06e70..a3667e06534564 100644
--- a/src/transformers/models/llama/configuration_llama.py
+++ b/src/transformers/models/llama/configuration_llama.py
@@ -192,7 +192,7 @@ def __init__(
self.mlp_bias = mlp_bias
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
# Validate the correctness of rotary position embeddings parameters
- # BC: if there is a 'type' field, move it to 'rope_type'.
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 9a1d6c0749f932..c7017832b9324c 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -951,7 +951,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index c43418182c3881..ffe16b27203301 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -767,7 +767,7 @@ def forward(
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
return_legacy_cache = True
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
index 2e23d06699087e..c7062e75b1085c 100644
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -1023,7 +1023,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 007e69570e7821..b4bda8e2db5251 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -873,7 +873,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py
index a53f1eeda61196..a33338365312db 100644
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -1012,7 +1012,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index 9fab09bdcc7877..ccaa2c7fd29aae 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -690,7 +690,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 0d8be04af20d5c..648d1653a3b503 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -981,7 +981,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index 273b6a8f505e79..ec395679ae6207 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -1008,7 +1008,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index 030c74b034b794..d0ea8ef0e376e0 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -920,7 +920,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index b196ed72a49b23..6f483e50cde065 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -1084,7 +1084,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py
index 27d0c856a61bd6..d91c0832ed33da 100755
--- a/src/transformers/models/stablelm/modeling_stablelm.py
+++ b/src/transformers/models/stablelm/modeling_stablelm.py
@@ -965,7 +965,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index c359c07c69c0b8..0be37c4e1fb91c 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -894,7 +894,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
From 5ce0a113b5bc9dd8dbb92dd866772d79847d9a92 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:07:59 +0200
Subject: [PATCH 02/50] Fix SSH workflow (#33451)
* fix
* update
---------
Co-authored-by: ydshieh
---
.github/workflows/ssh-runner.yml | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml
index b433abb484fac4..db649876f60492 100644
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@@ -58,8 +58,19 @@ jobs:
#because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
shell: bash
run: |
- if [ "${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" != "" ]; then
- echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
+ echo "${{ github.actor }}"
+ github_actor=${{ github.actor }}
+ github_actor=${github_actor//'-'/'_'}  # `//` replaces EVERY hyphen (a single `/` only replaces the first); secret names may not contain '-'
+ echo "$github_actor"
+ echo "github_actor=$github_actor" >> $GITHUB_ENV
+
+ - name: Store Slack infos
+ #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
+ shell: bash
+ run: |
+ echo "${{ env.github_actor }}"
+ if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
+ echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
else
echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
fi
From ce62a41880b5b70a304d068eb58f55894a5a7af8 Mon Sep 17 00:00:00 2001
From: Merve Noyan
Date: Mon, 16 Sep 2024 13:08:31 +0200
Subject: [PATCH 03/50] Add keypoint-detection task guide (#33274)
---------
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/tasks/keypoint_detection.md | 154 +++++++++++++++++++++
2 files changed, 156 insertions(+)
create mode 100644 docs/source/en/tasks/keypoint_detection.md
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 235ea81a7f1ea6..7eff2a38302669 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -81,6 +81,8 @@
title: Image Feature Extraction
- local: tasks/mask_generation
title: Mask Generation
+ - local: tasks/keypoint_detection
+ title: Keypoint Detection
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge Distillation for Computer Vision
title: Computer Vision
diff --git a/docs/source/en/tasks/keypoint_detection.md b/docs/source/en/tasks/keypoint_detection.md
new file mode 100644
index 00000000000000..a0ec71a5c22000
--- /dev/null
+++ b/docs/source/en/tasks/keypoint_detection.md
@@ -0,0 +1,154 @@
+
+
+# Keypoint Detection
+
+[[open-in-colab]]
+
+Keypoint detection identifies and locates specific points of interest within an image. These keypoints, also known as landmarks, represent meaningful features of objects, such as facial features or object parts. These models take an image input and return the following outputs:
+
+- **Keypoints and Scores**: Points of interest and their confidence scores.
+- **Descriptors**: A representation of the image region surrounding each keypoint, capturing its texture, gradient, orientation and other properties.
+
+In this guide, we will show how to extract keypoints from images.
+
+For this tutorial, we will use [SuperPoint](./model_doc/superpoint.md), a foundation model for keypoint detection.
+
+```python
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+```
+
+Let's test the model on the images below.
+
+
+
+
+
+
+
+```python
+import torch
+from PIL import Image
+import requests
+import cv2
+
+
+url_image_1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
+image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
+url_image_2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
+image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+
+images = [image_1, image_2]
+```
+
+We can now process our inputs and infer.
+
+```python
+inputs = processor(images,return_tensors="pt").to(model.device, model.dtype)
+outputs = model(**inputs)
+```
+
+The model output has relative keypoints, descriptors, masks and scores for each item in the batch. The mask highlights areas of the image where keypoints are present.
+
+```python
+SuperPointKeypointDescriptionOutput(loss=None, keypoints=tensor([[[0.0437, 0.0167],
+ [0.0688, 0.0167],
+ [0.0172, 0.0188],
+ ...,
+ [0.5984, 0.9812],
+ [0.6953, 0.9812]]]),
+ scores=tensor([[0.0056, 0.0053, 0.0079, ..., 0.0125, 0.0539, 0.0377],
+ [0.0206, 0.0058, 0.0065, ..., 0.0000, 0.0000, 0.0000]],
+ grad_fn=), descriptors=tensor([[[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ ...],
+ grad_fn=), mask=tensor([[1, 1, 1, ..., 1, 1, 1],
+ [1, 1, 1, ..., 0, 0, 0]], dtype=torch.int32), hidden_states=None)
+```
+
+To plot actual keypoints in the image, we need to postprocess the output. To do so, we have to pass the actual image sizes to `post_process_keypoint_detection` along with outputs.
+
+```python
+image_sizes = [(image.size[1], image.size[0]) for image in images]
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+```
+
+The outputs are now a list of dictionaries where each dictionary is a processed output of keypoints, scores and descriptors.
+
+```python
+[{'keypoints': tensor([[ 226, 57],
+ [ 356, 57],
+ [ 89, 64],
+ ...,
+ [3604, 3391]], dtype=torch.int32),
+ 'scores': tensor([0.0056, 0.0053, ...], grad_fn=),
+ 'descriptors': tensor([[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357]],
+ grad_fn=)},
+ {'keypoints': tensor([[ 46, 6],
+ [ 78, 6],
+ [422, 6],
+ [206, 404]], dtype=torch.int32),
+ 'scores': tensor([0.0206, 0.0058, 0.0065, 0.0053, 0.0070, ...], grad_fn=),
+ 'descriptors': tensor([[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211],
+ [-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211]])}]
+```
+
+We can use these to plot the keypoints.
+
+```python
+import matplotlib.pyplot as plt
+import torch
+
+for i in range(len(images)):
+ keypoints = outputs[i]["keypoints"]
+ scores = outputs[i]["scores"]
+ descriptors = outputs[i]["descriptors"]
+ keypoints = outputs[i]["keypoints"].detach().numpy()
+ scores = outputs[i]["scores"].detach().numpy()
+ image = images[i]
+ image_width, image_height = image.size
+
+ plt.axis('off')
+ plt.imshow(image)
+ plt.scatter(
+ keypoints[:, 0],
+ keypoints[:, 1],
+ s=scores * 100,
+ c='cyan',
+ alpha=0.4
+ )
+ plt.show()
+```
+
+Below you can see the outputs.
+
+
+
+
+
+
From 2f62146f0e916c3e6752b59d34853be6df0506f2 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:26:26 -0400
Subject: [PATCH 04/50] Uniformize kwargs for LLaVa processor and update docs
(#32858)
* Uniformize kwargs for LlaVa and update docs
* Change order of processor inputs in docstring
* Improve BC support for reversed images and text inputs
* cleanup llava processor call docstring
* Add encoded inputs as valid text inputs in reverse input check, add deprecation version in warning
* Put function check reversed images text outside base processor class
* Refactor _validate_images_text_input_order
* Add ProcessingUtilTester
* fix processing and test_processing
---
.../models/llava/modeling_llava.py | 2 +-
.../models/llava/processing_llava.py | 73 ++++++++++---------
tests/models/llava/test_modeling_llava.py | 20 ++---
tests/models/llava/test_processor_llava.py | 57 ++++++++++++++-
4 files changed, 104 insertions(+), 48 deletions(-)
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 9ad19ccee72228..eb1c55341b0784 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -405,7 +405,7 @@ def forward(
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+ >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 678724ae95be41..28a9410e6cbf0b 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -16,18 +16,33 @@
Processor class for Llava.
"""
-from typing import List, Optional, Union
+import sys
+from typing import List, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from ...utils import TensorType, logging
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+if sys.version_info >= (3, 11):
+ from typing import Unpack
+else:
+ from typing_extensions import Unpack
+
logger = logging.get_logger(__name__)
+class LlavaProcessorKwargs(ProcessingKwargs, total=False):
+ _defaults = {
+ "text_kwargs": {
+ "padding": False,
+ },
+ "images_kwargs": {},
+ }
+
+
class LlavaProcessor(ProcessorMixin):
r"""
Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor.
@@ -73,12 +88,11 @@ def __init__(
def __call__(
self,
- text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
- padding: Union[bool, str, PaddingStrategy] = False,
- truncation: Union[bool, str, TruncationStrategy] = None,
- max_length=None,
- return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+ audio=None,
+ videos=None,
+ **kwargs: Unpack[LlavaProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -88,29 +102,15 @@ def __call__(
of the above two methods for more information.
Args:
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+ tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
- images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
- The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
- tensor. Both channels-first and channels-last formats are supported.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`, *optional*):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
-
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
@@ -125,8 +125,19 @@ def __call__(
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
+ if images is None and text is None:
+ raise ValueError("You have to specify at least one of `images` or `text`.")
+
+ # check if images and text inputs are reversed for BC
+ images, text = _validate_images_text_input_order(images, text)
+
+ output_kwargs = self._merge_kwargs(
+ LlavaProcessorKwargs,
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+ **kwargs,
+ )
if images is not None:
- image_inputs = self.image_processor(images, return_tensors=return_tensors)
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
else:
image_inputs = {}
@@ -158,13 +169,7 @@ def __call__(
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
- text_inputs = self.tokenizer(
- prompt_strings,
- return_tensors=return_tensors,
- padding=padding,
- truncation=truncation,
- max_length=max_length,
- )
+ text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs})
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 5c05480ffa6dbb..305fc9e9a84cdb 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -274,7 +274,7 @@ def test_small_model_integration_test(self):
prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
image_file = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
- inputs = self.processor(prompt, raw_image, return_tensors="pt")
+ inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
@@ -299,7 +299,7 @@ def test_small_model_integration_test_llama_single(self):
prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
image_file = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
- inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip
@@ -325,7 +325,7 @@ def test_small_model_integration_test_llama_batched(self):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
- inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
+ inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -349,7 +349,7 @@ def test_small_model_integration_test_batch(self):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
- inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
+ inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -381,7 +381,7 @@ def test_small_model_integration_test_llama_batched_regression(self):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
- inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True)
+ inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -409,8 +409,8 @@ def test_batched_generation(self):
image2 = Image.open(requests.get(url2, stream=True).raw)
inputs = processor(
- text=[prompt1, prompt2, prompt3],
images=[image1, image2, image1, image2],
+ text=[prompt1, prompt2, prompt3],
return_tensors="pt",
padding=True,
).to(torch_device)
@@ -444,7 +444,7 @@ def test_llava_index_error_bug(self):
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
- inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@@ -510,7 +510,7 @@ def test_generation_no_images(self):
processor = AutoProcessor.from_pretrained(model_id)
# Prepare inputs with no images
- inputs = processor("Hello, I am", return_tensors="pt").to(torch_device)
+ inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@@ -554,13 +554,13 @@ def test_expansion_in_processing(self):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
- inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
- inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 18)
# generate exactly 20 tokens
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index 54c1b4674cbcef..5b05a8b92ea513 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -11,18 +11,43 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import shutil
+import tempfile
import unittest
-from transformers.testing_utils import require_vision
+from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor
+from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
+from ...test_processing_common import ProcessorTesterMixin
+
if is_vision_available():
- from transformers import AutoTokenizer, LlavaProcessor
+ from transformers import CLIPImageProcessor
@require_vision
-class LlavaProcessorTest(unittest.TestCase):
+class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+ processor_class = LlavaProcessor
+
+ def setUp(self):
+ self.tmpdirname = tempfile.mkdtemp()
+ image_processor = CLIPImageProcessor(do_center_crop=False)
+ tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
+
+ processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+ processor.save_pretrained(self.tmpdirname)
+
+ def get_tokenizer(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+ def get_image_processor(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+ def tearDown(self):
+ shutil.rmtree(self.tmpdirname)
+
def test_can_load_various_tokenizers(self):
for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]:
processor = LlavaProcessor.from_pretrained(checkpoint)
@@ -45,3 +70,29 @@ def test_chat_template(self):
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(expected_prompt, formatted_prompt)
+
+ @require_torch
+ @require_vision
+ def test_unstructured_kwargs_batched(self):
+ if "image_processor" not in self.processor_class.attributes:
+ self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+ image_processor = self.get_component("image_processor")
+ tokenizer = self.get_component("tokenizer")
+
+ processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+ self.skip_processor_without_typed_kwargs(processor)
+
+ input_str = ["lower newer", "upper older longer string"]
+ image_input = self.prepare_image_inputs() * 2
+ inputs = processor(
+ images=image_input,
+ text=input_str,
+ return_tensors="pt",
+ size={"height": 214, "width": 214},
+ padding="longest",
+ max_length=76,
+ )
+
+ self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+ self.assertEqual(len(inputs["input_ids"][0]), 5)
From c7a91f5adf976e0517c4a7f1506fb0c24f353053 Mon Sep 17 00:00:00 2001
From: Sergio Paniego Blanco
Date: Mon, 16 Sep 2024 18:52:27 +0200
Subject: [PATCH 05/50] `Agents, supercharged - Multi-agents, External tools,
and more` docs typo fixed (#33478)
* Typo fixed in Agents, supercharged
---
docs/source/en/agents_advanced.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md
index e7469a310c4102..399eeb9b70eb20 100644
--- a/docs/source/en/agents_advanced.md
+++ b/docs/source/en/agents_advanced.md
@@ -34,7 +34,7 @@ You can easily build hierarchical multi-agent systems with `transformers.agents`
To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.
-Here's an example of making an agent that managed a specitif web search agent using our [`DuckDuckGoSearchTool`]:
+Here's an example of making an agent that manages a specific web search agent using our [`DuckDuckGoSearchTool`]:
```py
from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
From c2d05897bf4e8b34773838accaddd66028bc148d Mon Sep 17 00:00:00 2001
From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com>
Date: Mon, 16 Sep 2024 20:02:03 +0300
Subject: [PATCH 06/50] [i18n-ar] Add File : `docs/source/ar/_toctree.yml`
(#32696)
* Update ar lang build_documentation.yml
* Update ar lang build_pr_documentation.yml
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Create _config.py
* Update _toctree.yml
* Update _toctree.yml
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update _toctree.yml
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update llm_tutorial.md
* Update _toctree.yml
* Update autoclass_tutorial.md
* Update autoclass_tutorial.md
* Update preprocessing.md
* Update glossary.md
* Update run_scripts.md
* Update run_scripts.md
* Update run_scripts.md
---------
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
---
.github/workflows/build_documentation.yml | 2 +-
.github/workflows/build_pr_documentation.yml | 2 +-
docs/source/ar/_config.py | 14 +
docs/source/ar/_toctree.yml | 892 +++++++++++++++++++
docs/source/ar/accelerate.md | 120 +++
docs/source/ar/agents.md | 539 +++++++++++
docs/source/ar/autoclass_tutorial.md | 167 ++++
docs/source/ar/conversations.md | 204 +++++
docs/source/ar/glossary.md | 446 ++++++++++
docs/source/ar/index.md | 342 +++++++
docs/source/ar/installation.md | 246 +++++
docs/source/ar/llm_tutorial.md | 248 ++++++
docs/source/ar/model_sharing.md | 223 +++++
docs/source/ar/peft.md | 250 ++++++
docs/source/ar/pipeline_tutorial.md | 315 +++++++
docs/source/ar/preprocessing.md | 521 +++++++++++
docs/source/ar/quicktour.md | 543 +++++++++++
docs/source/ar/run_scripts.md | 351 ++++++++
docs/source/ar/training.md | 412 +++++++++
19 files changed, 5835 insertions(+), 2 deletions(-)
create mode 100644 docs/source/ar/_config.py
create mode 100644 docs/source/ar/_toctree.yml
create mode 100644 docs/source/ar/accelerate.md
create mode 100644 docs/source/ar/agents.md
create mode 100644 docs/source/ar/autoclass_tutorial.md
create mode 100644 docs/source/ar/conversations.md
create mode 100644 docs/source/ar/glossary.md
create mode 100644 docs/source/ar/index.md
create mode 100644 docs/source/ar/installation.md
create mode 100644 docs/source/ar/llm_tutorial.md
create mode 100644 docs/source/ar/model_sharing.md
create mode 100644 docs/source/ar/peft.md
create mode 100644 docs/source/ar/pipeline_tutorial.md
create mode 100644 docs/source/ar/preprocessing.md
create mode 100644 docs/source/ar/quicktour.md
create mode 100644 docs/source/ar/run_scripts.md
create mode 100644 docs/source/ar/training.md
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index e3e3b5f2df37f1..b25567fb092a14 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -15,7 +15,7 @@ jobs:
commit_sha: ${{ github.sha }}
package: transformers
notebook_folder: transformers_doc
- languages: de en es fr hi it ko pt tr zh ja te
+ languages: ar de en es fr hi it ko pt tr zh ja te
custom_container: huggingface/transformers-doc-builder
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index c8d073ea34688f..f698f860b2f93c 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,5 +14,5 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: transformers
- languages: de en es fr hi it ko pt tr zh ja te
+ languages: ar de en es fr hi it ko pt tr zh ja te
custom_container: huggingface/transformers-doc-builder
diff --git a/docs/source/ar/_config.py b/docs/source/ar/_config.py
new file mode 100644
index 00000000000000..f49e4e4731965a
--- /dev/null
+++ b/docs/source/ar/_config.py
@@ -0,0 +1,14 @@
+# docstyle-ignore
+INSTALL_CONTENT = """
+# Transformers installation
+! pip install transformers datasets evaluate accelerate
+# To install from source instead of the last release, comment the command above and uncomment the following one.
+# ! pip install git+https://github.com/huggingface/transformers.git
+"""
+
+notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]  # one code cell holding the install snippet; NOTE(review): presumably consumed by the doc/notebook builder - confirm
+black_avoid_patterns = {  # maps "{...}" template placeholders to stand-in class names; NOTE(review): presumably used when formatting templated code samples - confirm
+ "{processor_class}": "FakeProcessorClass",
+ "{model_class}": "FakeModelClass",
+ "{object_class}": "FakeObjectClass",
+}
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
new file mode 100644
index 00000000000000..39e0ae14e19c29
--- /dev/null
+++ b/docs/source/ar/_toctree.yml
@@ -0,0 +1,892 @@
+- sections:
+ - local: index
+ title: 🤗 المحولات
+ - local: quicktour
+ title: جولة سريعة
+ - local: installation
+ title: التثبيت
+ title: البدء
+- sections:
+ - local: pipeline_tutorial
+ title: تشغيل الاستنتاج باستخدام خطوط الأنابيب
+ - local: autoclass_tutorial
+    title: كتابة تعليمات برمجية متكيفة باستخدام AutoClass
+ - local: preprocessing
+ title: معالجة البيانات مسبقًا
+ - local: training
+ title: ضبط نموذج مسبق التدريب
+ - local: run_scripts
+ title: التدريب باستخدام نص برمجي
+ - local: accelerate
+ title: إعداد تدريب موزع باستخدام 🤗 Accelerate
+ - local: peft
+ title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT
+ - local: model_sharing
+ title: مشاركة نموذجك
+ - local: agents
+ title: الوكلاء
+ - local: llm_tutorial
+ title: التوليد باستخدام LLMs
+ - local: conversations
+ title: الدردشة مع المحولات
+ title: البرامج التعليمية
+# - sections:
+# - isExpanded: false
+# sections:
+# - local: tasks/sequence_classification
+# title: تصنيف النصوص
+# - local: tasks/token_classification
+# title: تصنيف الرموز
+# - local: tasks/question_answering
+# title: الإجابة على الأسئلة
+# - local: tasks/language_modeling
+# title: نمذجة اللغة السببية
+# - local: tasks/masked_language_modeling
+# title: نمذجة اللغة المقنعة
+# - local: tasks/translation
+# title: الترجمة
+# - local: tasks/summarization
+# title: التلخيص
+# - local: tasks/multiple_choice
+# title: الاختيار المتعدد
+# title: معالجة اللغات الطبيعية
+# - isExpanded: false
+# sections:
+# - local: tasks/audio_classification
+# title: تصنيف الصوت
+# - local: tasks/asr
+# title: التعرف التلقائي على الكلام
+# title: الصوت
+# - isExpanded: false
+# sections:
+# - local: tasks/image_classification
+# title: تصنيف الصور
+# - local: tasks/semantic_segmentation
+# title: تجزئة الصور
+# - local: tasks/video_classification
+# title: تصنيف الفيديو
+# - local: tasks/object_detection
+# title: اكتشاف الأشياء
+# - local: tasks/zero_shot_object_detection
+# title: اكتشاف الأشياء بدون تدريب
+# - local: tasks/zero_shot_image_classification
+# title: تصنيف الصور بدون تدريب
+# - local: tasks/monocular_depth_estimation
+# title: تقدير العمق
+# - local: tasks/image_to_image
+# title: صورة إلى صورة
+# - local: tasks/image_feature_extraction
+# title: استخراج ميزات الصورة
+# - local: tasks/mask_generation
+# title: توليد القناع
+# - local: tasks/knowledge_distillation_for_image_classification
+# title: التقليل المعرفي للرؤية الحاسوبية
+# title: الرؤية الحاسوبية
+# - isExpanded: false
+# sections:
+# - local: tasks/image_captioning
+# title: وصف الصور Image captioning
+# - local: tasks/document_question_answering
+# title: الإجابة على أسئلة المستندات
+# - local: tasks/visual_question_answering
+# title: الإجابة على الأسئلة المرئية
+# - local: tasks/text-to-speech
+# title: تحويل النص إلى كلام
+# title: المتعددة الوسائط
+# - isExpanded: false
+# sections:
+# - local: generation_strategies
+# title: تخصيص استراتيجية التوليد
+# - local: kv_cache
+# title: أفضل الممارسات للتوليد باستخدام ذاكرة التخزين المؤقت
+# title: التوليد
+# - isExpanded: false
+# sections:
+# - local: tasks/idefics
+# title: مهام الصور مع IDEFICS
+# - local: tasks/prompting
+# title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة
+# title: الإرشاد
+# title: أدلة المهام
+# - sections:
+# - local: fast_tokenizers
+# title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers
+# - local: multilingual
+# title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات
+# - local: create_a_model
+# title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج
+# - local: custom_models
+# title: مشاركة نموذج مخصص
+# - local: chat_templating
+# title: قوالب لنماذج الدردشة
+# - local: trainer
+# title: المدرب
+# - local: sagemaker
+# title: تشغيل التدريب على Amazon SageMaker
+# - local: serialization
+# title: التصدير إلى ONNX
+# - local: tflite
+# title: التصدير إلى TFLite
+# - local: torchscript
+# title: التصدير إلى TorchScript
+# - local: benchmarks
+# title: المعايير
+# - local: notebooks
+# title: دفاتر الملاحظات مع الأمثلة
+# - local: community
+# title: موارد المجتمع
+# - local: troubleshooting
+# title: استكشاف الأخطاء وإصلاحها
+# - local: gguf
+# title: التوافق مع ملفات GGUF
+# title: أدلة المطورين
+# - sections:
+# - local: quantization/overview
+# title: نظرة عامة
+# - local: quantization/bitsandbytes
+# title: bitsandbytes
+# - local: quantization/gptq
+# title: GPTQ
+# - local: quantization/awq
+# title: AWQ
+# - local: quantization/aqlm
+# title: AQLM
+# - local: quantization/quanto
+# title: Quanto
+# - local: quantization/eetq
+# title: EETQ
+# - local: quantization/hqq
+# title: HQQ
+# - local: quantization/optimum
+# title: Optimum
+# - local: quantization/contribute
+# title: المساهمة بطريقة جديدة للتكميم
+# title: أساليب التكميم
+# - sections:
+# - local: performance
+# title: الأداء-نظرة عامة
+# - local: llm_optims
+# title: تحسين الاستدلال LLM
+# - sections:
+# - local: perf_train_gpu_one
+# title: استخدام عدة وحدات معالجة رسوميات (GPUs) بشكل متوازٍ
+# - local: perf_train_gpu_many
+# title: وحدات معالجة الرسومات (GPU) متعددة والتوازي
+# - local: fsdp
+# title: Fully Sharded Data Parallel
+# - local: deepspeed
+# title: DeepSpeed
+# - local: perf_train_cpu
+# title: التدريب الفعال على وحدة المعالجة المركزية (CPU)
+# - local: perf_train_cpu_many
+# title: التدريب الموزع لوحدة المعالجة المركزية (CPU)
+# - local: perf_train_tpu_tf
+# title: التدريب على (TPU) باستخدام TensorFlow
+# - local: perf_train_special
+# title: تدريب PyTorch على Apple silicon
+# - local: perf_hardware
+# title: الأجهزة المخصصة للتدريب
+# - local: hpo_train
+# title: البحث عن المعاملات المثلى باستخدام واجهة برمجة تطبيقات المدرب
+# title: تقنيات التدريب الفعال
+# - sections:
+# - local: perf_infer_cpu
+# title: الإستدلال على وحدة المعالجة المركزية (CPU)
+# - local: perf_infer_gpu_one
+# title: الإستدلال على وحدة معالجة الرسومات (GPU)
+# title: تحسين الاستدلال
+# - local: big_models
+# title: إنشاء نموذج كبير
+# - local: debugging
+# title: تصحيح الأخطاء البرمجية
+# - local: tf_xla
+# title: تكامل XLA لنماذج TensorFlow
+# - local: perf_torch_compile
+# title: تحسين الاستدلال باستخدام `torch.compile()`
+# title: الأداء وقابلية التوسع
+# - sections:
+# - local: contributing
+# title: كيفية المساهمة في 🤗 المحولات؟
+# - local: add_new_model
+# title: كيفية إضافة نموذج إلى 🤗 المحولات؟
+# - local: add_new_pipeline
+# title: كيفية إضافة خط أنابيب إلى 🤗 المحولات؟
+# - local: testing
+# title: الاختبار
+# - local: pr_checks
+# title: التحقق من طلب السحب
+# title: المساهمة
+- sections:
+ # - local: philosophy
+ # title: الفلسفة
+ - local: glossary
+    title: قاموس المصطلحات (قائمة الكلمات)
+ # - local: task_summary
+ # title: ما الذي يمكن أن تفعله 🤗 المحولات
+ # - local: tasks_explained
+ # title: كيف تحل المحولات المهام
+ # - local: model_summary
+ # title: عائلة نماذج المحول
+ # - local: tokenizer_summary
+ # title: ملخص برنامج مقسم النصوص (tokenizers)
+ # - local: attention
+ # title: الانتباه Attention
+ # - local: pad_truncation
+ # title: الحشو والتقليم
+ # - local: bertology
+ # title: BERTology
+ # - local: perplexity
+ # title: حيرة النماذج ذات الطول الثابت
+ # - local: pipeline_webserver
+ # title: خطوط الأنابيب للاستدلال على خادم الويب
+ # - local: model_memory_anatomy
+ # title: تشريح تدريب النموذج
+ # - local: llm_tutorial_optimization
+ # title: الاستفادة القصوى من LLMs
+ title: أطر مفاهيمية
+# - sections:
+# - sections:
+# - local: main_classes/agent
+# title: الوكلاء والأدوات
+# - local: model_doc/auto
+# title: فئات يتم إنشاؤها ديناميكيًا
+# - local: main_classes/backbones
+# title: العمود الفقري
+# - local: main_classes/callback
+# title: عمليات الاسترجاع
+# - local: main_classes/configuration
+# title: التكوين
+# - local: main_classes/data_collator
+# title: مجمع البيانات
+# - local: main_classes/keras_callbacks
+# title: استدعاءات Keras
+# - local: main_classes/logging
+# title: التسجيل
+# - local: main_classes/model
+# title: النماذج
+# - local: main_classes/text_generation
+# title: توليد النصوص
+# - local: main_classes/onnx
+# title: ONNX
+# - local: main_classes/optimizer_schedules
+# title: التحسين
+# - local: main_classes/output
+# title: مخرجات النموذج
+# - local: main_classes/pipelines
+# title: خطوط الأنابيب
+# - local: main_classes/processors
+# title: المعالجات
+# - local: main_classes/quantization
+# title: التكميم
+# - local: main_classes/tokenizer
+# title: برنامج مقسم النصوص
+# - local: main_classes/trainer
+# title: المدرب
+# - local: main_classes/deepspeed
+# title: DeepSpeed
+# - local: main_classes/feature_extractor
+# title: مستخرج الميزات
+# - local: main_classes/image_processor
+# title: معالج الصور
+# title: الفئات الرئيسية
+# - sections:
+# - isExpanded: false
+# sections:
+# - local: model_doc/albert
+# title: ALBERT
+# - local: model_doc/bart
+# title: BART
+# - local: model_doc/barthez
+# title: BARThez
+# - local: model_doc/bartpho
+# title: BARTpho
+# - local: model_doc/bert
+# title: BERT
+# - local: model_doc/bert-generation
+# title: BertGeneration
+# - local: model_doc/bert-japanese
+# title: BertJapanese
+# - local: model_doc/bertweet
+# title: Bertweet
+# - local: model_doc/big_bird
+# title: BigBird
+# - local: model_doc/bigbird_pegasus
+# title: BigBirdPegasus
+# - local: model_doc/biogpt
+# title: BioGpt
+# - local: model_doc/blenderbot
+# title: Blenderbot
+# - local: model_doc/blenderbot-small
+# title: Blenderbot Small
+# - local: model_doc/bloom
+# title: BLOOM
+# - local: model_doc/bort
+# title: BORT
+# - local: model_doc/byt5
+# title: ByT5
+# - local: model_doc/camembert
+# title: CamemBERT
+# - local: model_doc/canine
+# title: CANINE
+# - local: model_doc/codegen
+# title: CodeGen
+# - local: model_doc/code_llama
+# title: CodeLlama
+# - local: model_doc/cohere
+# title: Cohere
+# - local: model_doc/convbert
+# title: ConvBERT
+# - local: model_doc/cpm
+# title: CPM
+# - local: model_doc/cpmant
+# title: CPMANT
+# - local: model_doc/ctrl
+# title: CTRL
+# - local: model_doc/dbrx
+# title: DBRX
+# - local: model_doc/deberta
+# title: DeBERTa
+# - local: model_doc/deberta-v2
+# title: DeBERTa-v2
+# - local: model_doc/dialogpt
+# title: DialoGPT
+# - local: model_doc/distilbert
+# title: DistilBERT
+# - local: model_doc/dpr
+# title: DPR
+# - local: model_doc/electra
+# title: ELECTRA
+# - local: model_doc/encoder-decoder
+# title: Encoder Decoder Models
+# - local: model_doc/ernie
+# title: ERNIE
+# - local: model_doc/ernie_m
+# title: ErnieM
+# - local: model_doc/esm
+# title: ESM
+# - local: model_doc/falcon
+# title: Falcon
+# - local: model_doc/fastspeech2_conformer
+# title: FastSpeech2Conformer
+# - local: model_doc/flan-t5
+# title: FLAN-T5
+# - local: model_doc/flan-ul2
+# title: FLAN-UL2
+# - local: model_doc/flaubert
+# title: FlauBERT
+# - local: model_doc/fnet
+# title: FNet
+# - local: model_doc/fsmt
+# title: FSMT
+# - local: model_doc/funnel
+# title: Funnel Transformer
+# - local: model_doc/fuyu
+# title: Fuyu
+# - local: model_doc/gemma
+# title: Gemma
+# - local: model_doc/openai-gpt
+# title: GPT
+# - local: model_doc/gpt_neo
+# title: GPT Neo
+# - local: model_doc/gpt_neox
+# title: GPT NeoX
+# - local: model_doc/gpt_neox_japanese
+# title: GPT NeoX Japanese
+# - local: model_doc/gptj
+# title: GPT-J
+# - local: model_doc/gpt2
+# title: GPT2
+# - local: model_doc/gpt_bigcode
+# title: GPTBigCode
+# - local: model_doc/gptsan-japanese
+# title: GPTSAN Japanese
+# - local: model_doc/gpt-sw3
+# title: GPTSw3
+# - local: model_doc/herbert
+# title: HerBERT
+# - local: model_doc/ibert
+# title: I-BERT
+# - local: model_doc/jamba
+# title: Jamba
+# - local: model_doc/jetmoe
+# title: JetMoe
+# - local: model_doc/jukebox
+# title: Jukebox
+# - local: model_doc/led
+# title: LED
+# - local: model_doc/llama
+# title: LLaMA
+# - local: model_doc/llama2
+# title: Llama2
+# - local: model_doc/llama3
+# title: Llama3
+# - local: model_doc/longformer
+# title: Longformer
+# - local: model_doc/longt5
+# title: LongT5
+# - local: model_doc/luke
+# title: LUKE
+# - local: model_doc/m2m_100
+# title: M2M100
+# - local: model_doc/madlad-400
+# title: MADLAD-400
+# - local: model_doc/mamba
+# title: Mamba
+# - local: model_doc/marian
+# title: MarianMT
+# - local: model_doc/markuplm
+# title: MarkupLM
+# - local: model_doc/mbart
+# title: MBart and MBart-50
+# - local: model_doc/mega
+# title: MEGA
+# - local: model_doc/megatron-bert
+# title: MegatronBERT
+# - local: model_doc/megatron_gpt2
+# title: MegatronGPT2
+# - local: model_doc/mistral
+# title: Mistral
+# - local: model_doc/mixtral
+# title: Mixtral
+# - local: model_doc/mluke
+# title: mLUKE
+# - local: model_doc/mobilebert
+# title: MobileBERT
+# - local: model_doc/mpnet
+# title: MPNet
+# - local: model_doc/mpt
+# title: MPT
+# - local: model_doc/mra
+# title: MRA
+# - local: model_doc/mt5
+# title: MT5
+# - local: model_doc/mvp
+# title: MVP
+# - local: model_doc/nezha
+# title: NEZHA
+# - local: model_doc/nllb
+# title: NLLB
+# - local: model_doc/nllb-moe
+# title: NLLB-MoE
+# - local: model_doc/nystromformer
+# title: Nyströmformer
+# - local: model_doc/olmo
+# title: OLMo
+# - local: model_doc/open-llama
+# title: Open-Llama
+# - local: model_doc/opt
+# title: OPT
+# - local: model_doc/pegasus
+# title: Pegasus
+# - local: model_doc/pegasus_x
+# title: PEGASUS-X
+# - local: model_doc/persimmon
+# title: Persimmon
+# - local: model_doc/phi
+# title: Phi
+# - local: model_doc/phi3
+# title: Phi-3
+# - local: model_doc/phobert
+# title: PhoBERT
+# - local: model_doc/plbart
+# title: PLBart
+# - local: model_doc/prophetnet
+# title: ProphetNet
+# - local: model_doc/qdqbert
+# title: QDQBert
+# - local: model_doc/qwen2
+# title: Qwen2
+# - local: model_doc/qwen2_moe
+# title: Qwen2MoE
+# - local: model_doc/rag
+# title: RAG
+# - local: model_doc/realm
+# title: REALM
+# - local: model_doc/recurrent_gemma
+# title: RecurrentGemma
+# - local: model_doc/reformer
+# title: Reformer
+# - local: model_doc/rembert
+# title: RemBERT
+# - local: model_doc/retribert
+# title: RetriBERT
+# - local: model_doc/roberta
+# title: RoBERTa
+# - local: model_doc/roberta-prelayernorm
+# title: RoBERTa-PreLayerNorm
+# - local: model_doc/roc_bert
+# title: RoCBert
+# - local: model_doc/roformer
+# title: RoFormer
+# - local: model_doc/rwkv
+# title: RWKV
+# - local: model_doc/splinter
+# title: Splinter
+# - local: model_doc/squeezebert
+# title: SqueezeBERT
+# - local: model_doc/stablelm
+# title: StableLm
+# - local: model_doc/starcoder2
+# title: Starcoder2
+# - local: model_doc/switch_transformers
+# title: SwitchTransformers
+# - local: model_doc/t5
+# title: T5
+# - local: model_doc/t5v1.1
+# title: T5v1.1
+# - local: model_doc/tapex
+# title: TAPEX
+# - local: model_doc/transfo-xl
+# title: Transformer XL
+# - local: model_doc/ul2
+# title: UL2
+# - local: model_doc/umt5
+# title: UMT5
+# - local: model_doc/xmod
+# title: X-MOD
+# - local: model_doc/xglm
+# title: XGLM
+# - local: model_doc/xlm
+# title: XLM
+# - local: model_doc/xlm-prophetnet
+# title: XLM-ProphetNet
+# - local: model_doc/xlm-roberta
+# title: XLM-RoBERTa
+# - local: model_doc/xlm-roberta-xl
+# title: XLM-RoBERTa-XL
+# - local: model_doc/xlm-v
+# title: XLM-V
+# - local: model_doc/xlnet
+# title: XLNet
+# - local: model_doc/yoso
+# title: YOSO
+# title: Text models
+# - isExpanded: false
+# sections:
+# - local: model_doc/beit
+# title: BEiT
+# - local: model_doc/bit
+# title: BiT
+# - local: model_doc/conditional_detr
+# title: Conditional DETR
+# - local: model_doc/convnext
+# title: ConvNeXT
+# - local: model_doc/convnextv2
+# title: ConvNeXTV2
+# - local: model_doc/cvt
+# title: CVT
+# - local: model_doc/deformable_detr
+# title: Deformable DETR
+# - local: model_doc/deit
+# title: DeiT
+# - local: model_doc/depth_anything
+# title: Depth Anything
+# - local: model_doc/deta
+# title: DETA
+# - local: model_doc/detr
+# title: DETR
+# - local: model_doc/dinat
+# title: DiNAT
+# - local: model_doc/dinov2
+# title: DINOV2
+# - local: model_doc/dit
+# title: DiT
+# - local: model_doc/dpt
+# title: DPT
+# - local: model_doc/efficientformer
+# title: EfficientFormer
+# - local: model_doc/efficientnet
+# title: EfficientNet
+# - local: model_doc/focalnet
+# title: FocalNet
+# - local: model_doc/glpn
+# title: GLPN
+# - local: model_doc/imagegpt
+# title: ImageGPT
+# - local: model_doc/levit
+# title: LeViT
+# - local: model_doc/mask2former
+# title: Mask2Former
+# - local: model_doc/maskformer
+# title: MaskFormer
+# - local: model_doc/mobilenet_v1
+# title: MobileNetV1
+# - local: model_doc/mobilenet_v2
+# title: MobileNetV2
+# - local: model_doc/mobilevit
+# title: MobileViT
+# - local: model_doc/mobilevitv2
+# title: MobileViTV2
+# - local: model_doc/nat
+# title: NAT
+# - local: model_doc/poolformer
+# title: PoolFormer
+# - local: model_doc/pvt
+# title: Pyramid Vision Transformer (PVT)
+# - local: model_doc/pvt_v2
+# title: Pyramid Vision Transformer v2 (PVTv2)
+# - local: model_doc/regnet
+# title: RegNet
+# - local: model_doc/resnet
+# title: ResNet
+# - local: model_doc/segformer
+# title: SegFormer
+# - local: model_doc/seggpt
+# title: SegGpt
+# - local: model_doc/superpoint
+# title: SuperPoint
+# - local: model_doc/swiftformer
+# title: SwiftFormer
+# - local: model_doc/swin
+# title: Swin Transformer
+# - local: model_doc/swinv2
+# title: Swin Transformer V2
+# - local: model_doc/swin2sr
+# title: Swin2SR
+# - local: model_doc/table-transformer
+# title: Table Transformer
+# - local: model_doc/upernet
+# title: UperNet
+# - local: model_doc/van
+# title: VAN
+# - local: model_doc/vit
+# title: Vision Transformer (ViT)
+# - local: model_doc/vit_hybrid
+# title: ViT Hybrid
+# - local: model_doc/vitdet
+# title: ViTDet
+# - local: model_doc/vit_mae
+# title: ViTMAE
+# - local: model_doc/vitmatte
+# title: ViTMatte
+# - local: model_doc/vit_msn
+# title: ViTMSN
+# - local: model_doc/yolos
+# title: YOLOS
+# title: Vision models
+# - isExpanded: false
+# sections:
+# - local: model_doc/audio-spectrogram-transformer
+# title: Audio Spectrogram Transformer
+# - local: model_doc/bark
+# title: Bark
+# - local: model_doc/clap
+# title: CLAP
+# - local: model_doc/encodec
+# title: EnCodec
+# - local: model_doc/hubert
+# title: Hubert
+# - local: model_doc/mctct
+# title: MCTCT
+# - local: model_doc/mms
+# title: MMS
+# - local: model_doc/musicgen
+# title: MusicGen
+# - local: model_doc/musicgen_melody
+# title: MusicGen Melody
+# - local: model_doc/pop2piano
+# title: Pop2Piano
+# - local: model_doc/seamless_m4t
+# title: Seamless-M4T
+# - local: model_doc/seamless_m4t_v2
+# title: SeamlessM4T-v2
+# - local: model_doc/sew
+# title: SEW
+# - local: model_doc/sew-d
+# title: SEW-D
+# - local: model_doc/speech_to_text
+# title: Speech2Text
+# - local: model_doc/speech_to_text_2
+# title: Speech2Text2
+# - local: model_doc/speecht5
+# title: SpeechT5
+# - local: model_doc/unispeech
+# title: UniSpeech
+# - local: model_doc/unispeech-sat
+# title: UniSpeech-SAT
+# - local: model_doc/univnet
+# title: UnivNet
+# - local: model_doc/vits
+# title: VITS
+# - local: model_doc/wav2vec2
+# title: Wav2Vec2
+# - local: model_doc/wav2vec2-bert
+# title: Wav2Vec2-BERT
+# - local: model_doc/wav2vec2-conformer
+# title: Wav2Vec2-Conformer
+# - local: model_doc/wav2vec2_phoneme
+# title: Wav2Vec2Phoneme
+# - local: model_doc/wavlm
+# title: WavLM
+# - local: model_doc/whisper
+# title: Whisper
+# - local: model_doc/xls_r
+# title: XLS-R
+# - local: model_doc/xlsr_wav2vec2
+# title: XLSR-Wav2Vec2
+# title: Audio models
+# - isExpanded: false
+# sections:
+# - local: model_doc/timesformer
+# title: TimeSformer
+# - local: model_doc/videomae
+# title: VideoMAE
+# - local: model_doc/vivit
+# title: ViViT
+# title: Video models
+# - isExpanded: false
+# sections:
+# - local: model_doc/align
+# title: ALIGN
+# - local: model_doc/altclip
+# title: AltCLIP
+# - local: model_doc/blip
+# title: BLIP
+# - local: model_doc/blip-2
+# title: BLIP-2
+# - local: model_doc/bridgetower
+# title: BridgeTower
+# - local: model_doc/bros
+# title: BROS
+# - local: model_doc/chinese_clip
+# title: Chinese-CLIP
+# - local: model_doc/clip
+# title: CLIP
+# - local: model_doc/clipseg
+# title: CLIPSeg
+# - local: model_doc/clvp
+# title: CLVP
+# - local: model_doc/data2vec
+# title: Data2Vec
+# - local: model_doc/deplot
+# title: DePlot
+# - local: model_doc/donut
+# title: Donut
+# - local: model_doc/flava
+# title: FLAVA
+# - local: model_doc/git
+# title: GIT
+# - local: model_doc/grounding-dino
+# title: Grounding DINO
+# - local: model_doc/groupvit
+# title: GroupViT
+# - local: model_doc/idefics
+# title: IDEFICS
+# - local: model_doc/idefics2
+# title: Idefics2
+# - local: model_doc/instructblip
+# title: InstructBLIP
+# - local: model_doc/kosmos-2
+# title: KOSMOS-2
+# - local: model_doc/layoutlm
+# title: LayoutLM
+# - local: model_doc/layoutlmv2
+# title: LayoutLMV2
+# - local: model_doc/layoutlmv3
+# title: LayoutLMV3
+# - local: model_doc/layoutxlm
+# title: LayoutXLM
+# - local: model_doc/lilt
+# title: LiLT
+# - local: model_doc/llava
+# title: Llava
+# - local: model_doc/llava_next
+# title: LLaVA-NeXT
+# - local: model_doc/lxmert
+# title: LXMERT
+# - local: model_doc/matcha
+# title: MatCha
+# - local: model_doc/mgp-str
+# title: MGP-STR
+# - local: model_doc/nougat
+# title: Nougat
+# - local: model_doc/oneformer
+# title: OneFormer
+# - local: model_doc/owlvit
+# title: OWL-ViT
+# - local: model_doc/owlv2
+# title: OWLv2
+# - local: model_doc/paligemma
+# title: PaliGemma
+# - local: model_doc/perceiver
+# title: Perceiver
+# - local: model_doc/pix2struct
+# title: Pix2Struct
+# - local: model_doc/sam
+# title: Segment Anything
+# - local: model_doc/siglip
+# title: SigLIP
+# - local: model_doc/speech-encoder-decoder
+# title: Speech Encoder Decoder Models
+# - local: model_doc/tapas
+# title: TAPAS
+# - local: model_doc/trocr
+# title: TrOCR
+# - local: model_doc/tvlt
+# title: TVLT
+# - local: model_doc/tvp
+# title: TVP
+# - local: model_doc/udop
+# title: UDOP
+# - local: model_doc/video_llava
+# title: VideoLlava
+# - local: model_doc/vilt
+# title: ViLT
+# - local: model_doc/vipllava
+# title: VipLlava
+# - local: model_doc/vision-encoder-decoder
+# title: Vision Encoder Decoder Models
+# - local: model_doc/vision-text-dual-encoder
+# title: Vision Text Dual Encoder
+# - local: model_doc/visual_bert
+# title: VisualBERT
+# - local: model_doc/xclip
+# title: X-CLIP
+# title: Multimodal models
+# - isExpanded: false
+# sections:
+# - local: model_doc/decision_transformer
+# title: محول القرار
+# - local: model_doc/trajectory_transformer
+# title: محول المسار
+# title: نماذج التعلم التعزيزية
+# - isExpanded: false
+# sections:
+# - local: model_doc/autoformer
+# title: Autoformer
+# - local: model_doc/informer
+# title: Informer
+# - local: model_doc/patchtsmixer
+# title: PatchTSMixer
+# - local: model_doc/patchtst
+# title: PatchTST
+# - local: model_doc/time_series_transformer
+# title: محول السلاسل الزمنية
+# title: نماذج السلاسل الزمنية
+# - isExpanded: false
+# sections:
+# - local: model_doc/graphormer
+# title: Graphormer
+# title: نماذج الرسم البياني
+# title: النماذج
+# - sections:
+# - local: internal/modeling_utils
+# title: الطبقات المخصصة والمرافق
+# - local: internal/pipelines_utils
+# title: مرافق خطوط الأنابيب
+# - local: internal/tokenization_utils
+# title: مرافق مقسم النصوص
+# - local: internal/trainer_utils
+# title: مرافق المدرب
+# - local: internal/generation_utils
+# title: مرافق التوليد
+# - local: internal/image_processing_utils
+# title: مرافق معالجة الصور
+# - local: internal/audio_utils
+# title: مرافق معالجة الصوت
+# - local: internal/file_utils
+# title: مرافق عامة
+# - local: internal/time_series_utils
+# title: مرافق السلاسل الزمنية
+# title: مساعدون داخليون
+# title: API
diff --git a/docs/source/ar/accelerate.md b/docs/source/ar/accelerate.md
new file mode 100644
index 00000000000000..486c1efe59af60
--- /dev/null
+++ b/docs/source/ar/accelerate.md
@@ -0,0 +1,120 @@
+# التدريب الموزع باستخدام 🤗 Accelerate
+
+
+مع تزايد حجم النماذج اللغوية، برز التوازي كإحدى الاستراتيجيات لتدريب نماذج أكبر على أجهزة محدودة وتسريع عملية التدريب بمقدار كبير. في Hugging Face، قمنا بإنشاء مكتبة [🤗 Accelerate](https://huggingface.co/docs/accelerate) لمساعدة المستخدمين على تدريب أي نموذج من Transformers بسهولة على أي نوع من الإعدادات الموزعة، سواء كان ذلك على عدة وحدات معالجة رسومات (GPUs) على جهاز واحد أو على عدة وحدات معالجة رسومات موزعة على عدة أجهزة. في هذا الدليل، ستتعلم كيفية تخصيص حلقة تدريب PyTorch الأصلية لتمكين التدريب في بيئة موزعة.
+
+## الإعداد
+
+ابدأ بتثبيت 🤗 Accelerate:
+
+```bash
+pip install accelerate
+```
+
+ثم قم باستيراد وإنشاء كائن [`~accelerate.Accelerator`]. سيقوم [`~accelerate.Accelerator`] تلقائيًا باكتشاف نوع الإعداد الموزع الخاص بك وتهيئة جميع المكونات اللازمة للتدريب. لن تحتاج إلى وضع نموذجك على جهاز بشكل صريح.
+
+```py
+>>> from accelerate import Accelerator
+
+>>> accelerator = Accelerator()
+```
+
+## الاستعداد للتسريع
+
+الخطوة التالية هي تمرير جميع كائنات التدريب ذات الصلة إلى دالة الإعداد [`~accelerate.Accelerator.prepare`]. ويشمل ذلك DataLoaders للتدريب والتقييم، ونموذجًا ومُحسِّنًا (optimizer):
+
+```py
+>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+... train_dataloader, eval_dataloader, model, optimizer
+... )
+```
+
+## الخلفي Backward
+
+الإضافة الأخيرة هي استبدال الدالة المعتادة `loss.backward()` في حلقة التدريب الخاصة بك بدالة [`~accelerate.Accelerator.backward`] في 🤗 Accelerate:
+
+```py
+>>> for epoch in range(num_epochs):
+... for batch in train_dataloader:
+... outputs = model(**batch)
+... loss = outputs.loss
+... accelerator.backward(loss)
+
+... optimizer.step()
+... lr_scheduler.step()
+... optimizer.zero_grad()
+... progress_bar.update(1)
+```
+
+كما يمكنك أن ترى في الكود التالي، فأنت بحاجة فقط إلى إضافة أربعة أسطر من الكود إلى حلقة التدريب الخاصة بك لتمكين التدريب الموزع!
+
+```diff
++ from accelerate import Accelerator
+ from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
+
++ accelerator = Accelerator()
+
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+ optimizer = AdamW(model.parameters(), lr=3e-5)
+
+- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+- model.to(device)
+
++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
++ train_dataloader, eval_dataloader, model, optimizer
++ )
+
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dataloader)
+ lr_scheduler = get_scheduler(
+ "linear",
+ optimizer=optimizer,
+ num_warmup_steps=0,
+ num_training_steps=num_training_steps
+ )
+
+ progress_bar = tqdm(range(num_training_steps))
+
+ model.train()
+ for epoch in range(num_epochs):
+ for batch in train_dataloader:
+- batch = {k: v.to(device) for k, v in batch.items()}
+ outputs = model(**batch)
+ loss = outputs.loss
+- loss.backward()
++ accelerator.backward(loss)
+          optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+```
+
+## تدريب
+
+بمجرد إضافة أسطر الكود ذات الصلة، قم بتشغيل التدريب الخاص بك في أحد النصوص أو الدفاتر مثل Colaboratory.
+
+### التدريب باستخدام نص برمجي
+
+إذا كنت تشغل التدريب الخاص بك من نص برمجي، فقم بتشغيل الأمر التالي لإنشاء وحفظ ملف تكوين:
+
+```bash
+accelerate config
+```
+
+ثم قم بتشغيل التدريب الخاص بك باستخدام:
+
+```bash
+accelerate launch train.py
+```
+
+### التدريب باستخدام دفتر ملاحظات
+
+يمكن أيضًا تشغيل 🤗 Accelerate في دفاتر الملاحظات إذا كنت تخطط لاستخدام وحدات معالجة الموترات (TPUs) في Colaboratory. قم بتغليف كل الكود المسؤول عن التدريب في دالة، ومررها إلى [`~accelerate.notebook_launcher`]:
+
+```py
+>>> from accelerate import notebook_launcher
+
+>>> notebook_launcher(training_function)
+```
+
+للحصول على مزيد من المعلومات حول 🤗 Accelerate وميزاته الغنية، يرجى الرجوع إلى [الوثائق](https://huggingface.co/docs/accelerate).
\ No newline at end of file
diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md
new file mode 100644
index 00000000000000..92b2a4715f6f07
--- /dev/null
+++ b/docs/source/ar/agents.md
@@ -0,0 +1,539 @@
+# الوكلاء والأدوات
+
+[[open-in-colab]]
+
+### ما هو الوكيل؟
+
+يمكن للنماذج اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها.
+
+يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل".
+
+الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات".
+
+هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح.
+
+يمكن برمجة الوكيل للقيام بما يلي:
+- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال
+- التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال
+
+### أنواع الوكلاء
+
+#### الوكيل البرمجي (Code agent)
+
+يتبع هذا الوكيل خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم يولّد شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط.
+
+#### وكلاء التفاعل
+
+هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة.
+
+نقوم بتنفيذ إصدارين من ReactJsonAgent:
+- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها.
+- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء قوي في البرمجة.
+
+> [!TIP]
+> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct.
+
+![إطار عمل وكيل ReAct](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
+
+على سبيل المثال، إليك كيف يعمل وكيل ReAct Code طريقه من خلال السؤال التالي.
+
+```py3
+>>> agent.run(
+... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
+... )
+=====New task=====
+How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
+====Agent is executing the code below:
+bert_blocks = search(query="number of blocks in BERT base encoder")
+print("BERT blocks:", bert_blocks)
+====
+Print outputs:
+BERT blocks: twelve encoder blocks
+
+====Agent is executing the code below:
+attention_layer = search(query="number of layers in Attention is All You Need")
+print("Attention layers:", attention_layer)
+====
+Print outputs:
+Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
+
+====Agent is executing the code below:
+bert_blocks = 12
+attention_layers = 6
+diff = bert_blocks - attention_layers
+print("Difference in blocks:", diff)
+final_answer(diff)
+====
+
+Print outputs:
+Difference in blocks: 6
+
+Final answer: 6
+```
+
+### كيف يمكنني بناء وكيل؟
+
+لتهيئة وكيل، تحتاج إلى هذه الوسائط:
+
+- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له.
+- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته.
+- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها
+- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها
+
+عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا.
+
+للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية.
+
+```bash
+pip install transformers[agents]
+```
+
+قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop_sequences` يشير إلى متى يجب التوقف عن التوليد.
+
+```python
+from huggingface_hub import login, InferenceClient
+
+login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
+
+client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
+def llm_engine(messages, stop_sequences=["Task"]) -> str:
+ response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
+ answer = response.choices[0].message.content
+ return answer
+```
+
+يمكنك استخدام أي طريقة `llm_engine` طالما أنها:
+1. تتبع تنسيق [الرسائل](./chat_templating.md) لمدخلاتها (`List[Dict[str, str]]`) وتعيد `str`.
+2. تتوقف عن توليد المخرجات عند التسلسلات التي تم تمريرها في معامل `stop_sequences`.
+
+أنت بحاجة أيضًا إلى معامل `tools` الذي يقبل قائمة من الأدوات (`Tools`). يمكنك توفير قائمة فارغة لـ `tools`، مع استخدام صندوق الأدوات الافتراضي عبر المعامل الاختياري `add_base_tools=True`.
+
+الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى.
+
+```python
+from transformers import CodeAgent, HfEngine
+
+llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run(
+ "Could you translate this sentence from French, say it out loud and return the audio.",
+ sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], add_base_tools=True)
+
+agent.run(
+ "Could you translate this sentence from French, say it out loud and give me the audio.",
+ sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج.
+
+يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها:
+
+```py
+from transformers import ReactCodeAgent
+
+agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+```
+
+
+تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك.
+
+```python
+print(agent.system_prompt_template)
+```
+
+من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها.
+كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا.
+يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`.
+
+
+#### تنفيذ التعليمات البرمجية
+
+يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك.
+يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه.
+
+مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة.
+يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]:
+
+```py
+>>> from transformers import ReactCodeAgent
+
+>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
+
+(...)
+'Hugging Face – Blog'
+```
+
+سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل.
+
+> [!WARNING]
+> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقم باستدعاء أي دوال غير آمنة!
+
+### موجه النظام
+
+يولد الوكيل، أو بالأحرى LLM الذي يقود الوكيل، مخرجات بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً).
+
+```text
+You will be given a task to solve as best you can.
+You have access to the following tools:
+<<tool_descriptions>>
+
+To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
+Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence.
+During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
+
+In the end you have to return a final answer using the `final_answer` tool.
+
+Here are a few examples using notional tools:
+---
+{examples}
+
+Above example were using notional tools that might not exist for you. You only have acces to those tools:
+<<tool_names>>
+You also can perform computations in the python code you generate.
+
+Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```' sequence. You MUST provide at least the 'Code:' sequence to move forward.
+
+Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
+Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
+
+Remember to make sure that variables you use are all defined.
+
+Now Begin!
+```
+
+يتضمن موجه النظام:
+- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها.
+- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها.
+ - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه.
+- شكل المخرج المتوقع.
+
+يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات.
+
+للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`.
+
+```python
+from transformers import ReactJsonAgent
+from transformers.agents import PythonInterpreterTool
+
+agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
+```
+
+> [!WARNING]
+> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم
+> بالأدوات المتاحة.
+
+
+### فحص تشغيل الوكيل
+
+فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل:
+- تخزن `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس يُلحق بعد ذلك بـ `agent.logs`.
+- تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة.
+
+## الأدوات
+
+الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة.
+
+يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة.
+
+عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا.
+
+### صندوق الأدوات الافتراضي
+
+يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`:
+
+- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut))
+- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt))
+- **الكلام إلى نص**: تحويل الكلام إلى نص ([Whisper](./model_doc/whisper))
+- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5))
+- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف.
+- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الوكلاء المستندين إلى التعليمات البرمجية يمكنهم بالفعل تنفيذ كود Python.
+
+
+يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها.
+
+```python
+from transformers import load_tool
+
+tool = load_tool("text-to-speech")
+audio = tool("This is a text to speech tool")
+```
+
+### إنشاء أداة جديدة
+
+يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face.
+على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub.
+
+سوف نبدأ بالكود التالي.
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`].
+
+تحتاج الأداة المخصصة إلى:
+
+- اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`.
+- تستخدم خاصية `description` لملء موجه نظام الوكيل.
+- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات.
+- خاصية `output_type`، والتي تحدد نوع المخرج.
+- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية.
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+class HFModelDownloadsTool(Tool):
+ name = "model_download_counter"
+ description = (
+ "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+ "It returns the name of the checkpoint."
+ )
+
+ inputs = {
+ "task": {
+ "type": "text",
+ "description": "the task category (such as text-classification, depth-estimation, etc)",
+ }
+ }
+ output_type = "text"
+
+ def forward(self, task: str):
+ model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+ return model.id
+```
+
+الآن بعد أن أصبحت فئة `HFModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام.
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول بصلاحية الكتابة.
+
+```python
+tool.push_to_hub("{your_username}/hf-model-downloads")
+```
+
+قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك.
+
+```python
+from transformers import load_tool, CodeAgent
+
+model_download_tool = load_tool("m-ric/hf-model-downloads")
+agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
+agent.run(
+ "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+
+ستحصل على ما يلي:
+
+```text
+======== New task ========
+Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
+==== Agent is executing the code below:
+most_downloaded_model = model_download_counter(task="text-to-video")
+print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
+====
+```
+
+والناتج:
+
+`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`
+
+### إدارة صندوق أدوات الوكيل الخاص بك
+
+إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة.
+
+دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+agent.toolbox.add_tool(model_download_tool)
+```
+
+الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة:
+
+```python
+ agent.run(
+ "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
+ )
+```
+
+| **Audio** |
+|------------------------------------------------------------------------------------------------------------------------------------------------------|
+|
diff --git a/i18n/README_ar.md b/i18n/README_ar.md
index 60ec4e1c068907..c2dd588fdb233f 100644
--- a/i18n/README_ar.md
+++ b/i18n/README_ar.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_de.md b/i18n/README_de.md
index 7128a9ad999fc7..2532c9e12fab59 100644
--- a/i18n/README_de.md
+++ b/i18n/README_de.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_es.md b/i18n/README_es.md
index fdef68ff1b4c20..6682147d7867cf 100644
--- a/i18n/README_es.md
+++ b/i18n/README_es.md
@@ -44,6 +44,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_fr.md b/i18n/README_fr.md
index 2c78ba041db2f1..c1eaa10edb927d 100644
--- a/i18n/README_fr.md
+++ b/i18n/README_fr.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_hd.md b/i18n/README_hd.md
index a6e017a6f833c1..07077e5dd9c37d 100644
--- a/i18n/README_hd.md
+++ b/i18n/README_hd.md
@@ -69,6 +69,7 @@ checkpoint: जाँच बिंदु
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_ja.md b/i18n/README_ja.md
index 27b770869f7192..293a5ee111b0c7 100644
--- a/i18n/README_ja.md
+++ b/i18n/README_ja.md
@@ -79,6 +79,7 @@ user: ユーザ
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_ko.md b/i18n/README_ko.md
index 283318478f4b1e..e2a9b80d0d3ecc 100644
--- a/i18n/README_ko.md
+++ b/i18n/README_ko.md
@@ -44,6 +44,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md
index a356caefba9b42..79007e5aaa33f9 100644
--- a/i18n/README_pt-br.md
+++ b/i18n/README_pt-br.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_ru.md b/i18n/README_ru.md
index fe548c1001149a..759acdbb912771 100644
--- a/i18n/README_ru.md
+++ b/i18n/README_ru.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_te.md b/i18n/README_te.md
index 9dbd522c463db4..feb537ad1a48d2 100644
--- a/i18n/README_te.md
+++ b/i18n/README_te.md
@@ -51,6 +51,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
جدید ترین مشین لرننگ برائے JAX، PyTorch اور TensorFlow
+
+
+
+
+
+
+🤗 Transformers مختلف طریقوں جیسے کہ متن، بصارت، اور آڈیو پر کام کرنے کے لیے ہزاروں پری ٹرینڈ ماڈلز فراہم کرتے ہیں۔
+
+یہ ماڈلز درج ذیل پر لاگو کیے جا سکتے ہیں:
+
+* 📝 متن، جیسے کہ متن کی درجہ بندی، معلومات کا استخراج، سوالات کے جوابات، خلاصہ، ترجمہ، اور متن کی تخلیق، 100 سے زائد زبانوں میں۔
+* 🖼️ تصاویر، جیسے کہ تصویر کی درجہ بندی، اشیاء کی شناخت، اور تقسیم۔
+* 🗣️ آڈیو، جیسے کہ تقریر کی شناخت اور آڈیو کی درجہ بندی۔
+
+ٹرانسفارمر ماڈلز **مختلف طریقوں کو ملا کر** بھی کام انجام دے سکتے ہیں، جیسے کہ ٹیبل سوال جواب، بصری حروف کی شناخت، اسکین شدہ دستاویزات سے معلومات نکالنا، ویڈیو کی درجہ بندی، اور بصری سوال جواب۔
+
+🤗 Transformers ایسے APIs فراہم کرتا ہے جو آپ کو تیز رفتاری سے پری ٹرینڈ ماڈلز کو ایک دیے گئے متن پر ڈاؤن لوڈ اور استعمال کرنے، انہیں اپنے ڈیٹا سیٹس پر فائن ٹون کرنے، اور پھر ہمارے [ماڈل ہب](https://huggingface.co/models) پر کمیونٹی کے ساتھ شیئر کرنے کی سہولت دیتا ہے۔ اسی وقت، ہر پائتھن ماڈیول جو ایک آرکیٹیکچر کو بیان کرتا ہے، مکمل طور پر خود مختار ہوتا ہے اور اسے تیز تحقیقاتی تجربات کے لیے تبدیل کیا جا سکتا ہے۔
+
+
+🤗 Transformers تین سب سے مشہور ڈیپ لرننگ لائبریریوں — [Jax](https://jax.readthedocs.io/en/latest/)، [PyTorch](https://pytorch.org/) اور [TensorFlow](https://www.tensorflow.org/) — کی مدد سے تیار کردہ ہے، جن کے درمیان بے حد ہموار انضمام ہے۔ اپنے ماڈلز کو ایک کے ساتھ تربیت دینا اور پھر دوسرے کے ساتھ inference کے لیے لوڈ کرنا انتہائی سادہ ہے۔
+
+## آن لائن ڈیمو
+
+آپ ہمارے زیادہ تر ماڈلز کو براہ راست ان کے صفحات پر [ماڈل ہب](https://huggingface.co/models) سے آزما سکتے ہیں۔ ہم عوامی اور نجی ماڈلز کے لیے [ذاتی ماڈل ہوسٹنگ، ورژننگ، اور انفرنس API](https://huggingface.co/pricing) بھی فراہم کرتے ہیں۔
+
+یہاں چند مثالیں ہیں:
+
+قدرتی زبان کی پروسیسنگ میں:
+
+- [BERT کے ساتھ ماسک شدہ الفاظ کی تکمیل](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [Electra کے ساتھ نامزد اداروں کی شناخت](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [Mistral کے ساتھ متنی جنریشن](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [RoBERTa کے ساتھ قدرتی زبان کی دلیل](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [BART کے ساتھ خلاصہ کاری](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [DistilBERT کے ساتھ سوالات کے جوابات](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [T5 کے ساتھ ترجمہ](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+کمپیوٹر وژن میں:
+- [ViT کے ساتھ امیج کی درجہ بندی](https://huggingface.co/google/vit-base-patch16-224)
+- [DETR کے ساتھ اشیاء کی شناخت](https://huggingface.co/facebook/detr-resnet-50)
+- [SegFormer کے ساتھ سیمانٹک سیگمینٹیشن](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [Mask2Former کے ساتھ پینوسٹک سیگمینٹیشن](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
+- [Depth Anything کے ساتھ گہرائی کا اندازہ](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
+- [VideoMAE کے ساتھ ویڈیو کی درجہ بندی](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [OneFormer کے ساتھ یونیورسل سیگمینٹیشن](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+
+آڈیو:
+- [خودکار تقریر کی پہچان Whisper کے ساتھ](https://huggingface.co/openai/whisper-large-v3)
+- [کلیدی الفاظ کی تلاش Wav2Vec2 کے ساتھ](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [آڈیو کی درجہ بندی Audio Spectrogram Transformer کے ساتھ](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+ملٹی ماڈل ٹاسک میں:
+
+- [ٹیبل سوال جواب کے لیے TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [ویژول سوال جواب کے لیے ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [امیج کیپشننگ کے لیے LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- [زیرو شاٹ امیج کلاسیفیکیشن کے لیے SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384)
+- [دستاویزی سوال جواب کے لیے LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [زیرو شاٹ ویڈیو کلاسیفیکیشن کے لیے X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+- [زیرو شاٹ آبجیکٹ ڈیٹیکشن کے لیے OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
+- [زیرو شاٹ امیج سیگمنٹیشن کے لیے CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)
+- [خودکار ماسک جنریشن کے لیے SAM](https://huggingface.co/docs/transformers/model_doc/sam)
+
+
+## ٹرانسفارمرز کے 100 منصوبے
+
+🤗 Transformers صرف پیشگی تربیت یافتہ ماڈلز کا ایک ٹول کٹ نہیں ہے: یہ ایک کمیونٹی ہے جو اس کے ارد گرد اور ہیگنگ فیس ہب پر تعمیر شدہ منصوبوں کا مجموعہ ہے۔ ہم چاہتے ہیں کہ 🤗 Transformers ترقی کاروں، محققین، طلباء، پروفیسرز، انجینئرز، اور ہر کسی کو اپنے خوابوں کے منصوبے بنانے میں مدد فراہم کرے۔
+
+
+🤗 Transformers کے 100,000 ستاروں کی خوشی منانے کے لیے، ہم نے کمیونٹی پر روشنی ڈالنے کا فیصلہ کیا ہے، اور ہم نے [awesome-transformers](./awesome-transformers.md) کا صفحہ بنایا ہے جو 100 شاندار منصوبے درج کرتا ہے جو 🤗 Transformers کے ارد گرد بنائے گئے ہیں۔
+
+اگر آپ کے پاس کوئی ایسا منصوبہ ہے جسے آپ سمجھتے ہیں کہ اس فہرست کا حصہ ہونا چاہیے، تو براہ کرم ایک PR کھولیں تاکہ اسے شامل کیا جا سکے!
+
+## اگر آپ ہیگنگ فیس ٹیم سے حسب ضرورت معاونت تلاش کر رہے ہیں
+
+
+
+
+
+## فوری ٹور
+
+دیے گئے ان پٹ (متن، تصویر، آڈیو، ...) پر ماڈل کو فوری طور پر استعمال کرنے کے لیے، ہم pipeline API فراہم کرتے ہیں۔ پائپ لائنز ایک پیشگی تربیت یافتہ ماڈل کو اس ماڈل کی تربیت کے دوران استعمال ہونے والے پری پروسیسنگ کے ساتھ گروپ کرتی ہیں۔ یہاں یہ ہے کہ مثبت اور منفی متون کی درجہ بندی کے لیے پائپ لائن کو جلدی سے کیسے استعمال کیا جائے:
+
+
+```python
+>>> from transformers import pipeline
+
+# جذبات کے تجزیے کے لیے ایک پائپ لائن مختص کریں
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+دوسری لائن کوڈ پائپ لائن کے ذریعہ استعمال ہونے والے پیشگی تربیت یافتہ ماڈل کو ڈاؤن لوڈ اور کیش کرتی ہے، جبکہ تیسری لائن اسے دیے گئے متن پر جانچتی ہے۔ یہاں، جواب "مثبت" ہے جس کی اعتماد کی شرح 99.97% ہے۔
+
+بہت سے کاموں کے لیے ایک پیشگی تربیت یافتہ pipeline تیار ہے، NLP کے علاوہ کمپیوٹر ویژن اور آواز میں بھی۔ مثال کے طور پر، ہم تصویر میں دریافت شدہ اشیاء کو آسانی سے نکال سکتے ہیں:
+
+
+``` python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# پیاری بلیوں والی ایک تصویر ڈاؤن لوڈ کریں
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+  'label': 'remote',
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+  'label': 'remote',
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+  'label': 'couch',
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+  'label': 'cat',
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+  'label': 'cat',
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+یہاں، ہم کو تصویر میں دریافت شدہ اشیاء کی فہرست ملتی ہے، ہر ایک کے گرد ایک باکس اور اعتماد کا اسکور۔ یہاں اصل تصویر بائیں طرف ہے، اور پیشگوئیاں دائیں طرف ظاہر کی گئی ہیں:
+
+
+
+
+
+
+
+آپ `pipeline` API کی مدد سے معاونت شدہ کاموں کے بارے میں مزید جان سکتے ہیں [اس ٹیوٹوریل](https://huggingface.co/docs/transformers/task_summary) میں۔
+
+
+`pipeline` کے علاوہ، کسی بھی پیشگی تربیت یافتہ ماڈل کو آپ کے دیے گئے کام پر ڈاؤن لوڈ اور استعمال کرنے کے لیے، صرف تین لائنوں کا کوڈ کافی ہے۔ یہاں PyTorch ورژن ہے:
+
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+اور یہاں TensorFlow کے لیے مساوی کوڈ ہے:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+ٹوکینائزر تمام پری پروسیسنگ کا ذمہ دار ہے جس کی پیشگی تربیت یافتہ ماڈل کو ضرورت ہوتی ہے اور اسے براہ راست ایک واحد سٹرنگ (جیسا کہ اوپر کی مثالوں میں) یا ایک فہرست پر کال کیا جا سکتا ہے۔ یہ ایک لغت فراہم کرے گا جسے آپ ڈاؤن اسٹریم کوڈ میں استعمال کر سکتے ہیں یا سادہ طور پر اپنے ماڈل کو ** دلیل انپیکنگ آپریٹر کے ذریعے براہ راست پاس کر سکتے ہیں۔
+
+ماڈل خود ایک باقاعدہ [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) یا [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (آپ کے بیک اینڈ پر منحصر ہے) ہے جسے آپ معمول کے مطابق استعمال کر سکتے ہیں۔ [یہ ٹیوٹوریل](https://huggingface.co/docs/transformers/training) وضاحت کرتا ہے کہ کلاسیکی PyTorch یا TensorFlow تربیتی لوپ میں ایسے ماڈل کو کیسے ضم کیا جائے، یا ہمارے `Trainer` API کا استعمال کرتے ہوئے نئے ڈیٹا سیٹ پر جلدی سے فائن ٹیون کیسے کیا جائے۔
+
+## مجھے Transformers کیوں استعمال کرنا چاہیے؟
+
+ 1. استعمال میں آسان جدید ترین ماڈلز:
+
+ - قدرتی زبان کی سمجھ اور تخلیق، کمپیوٹر وژن، اور آڈیو کے کاموں میں اعلی کارکردگی۔
+ - معلمین اور عملی ماہرین کے لیے کم داخلی رکاوٹ۔
+ - سیکھنے کے لیے صرف تین کلاسز کے ساتھ چند یوزر فرینڈلی ایبسٹریکشنز۔
+ - ہمارے تمام pretrained ماڈلز کے استعمال کے لیے ایک متحد API۔
+
+ 2. کمپیوٹیشن کے اخراجات میں کمی، کاربن فٹ پرنٹ میں کمی:
+
+- محققین ہمیشہ دوبارہ تربیت کرنے کی بجائے تربیت شدہ ماڈلز شیئر کر سکتے ہیں۔
+- عملی ماہرین کمپیوٹ وقت اور پروڈکشن اخراجات کو کم کر سکتے ہیں۔
+- ہر موڈیلٹی کے لیے 400,000 سے زیادہ pretrained ماڈلز کے ساتھ درجنوں آرکیٹیکچرز۔
+
+ 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح
+فریم ورک کا انتخاب کریں:
+
+ - 3 لائنز کے کوڈ میں جدید ترین ماڈلز تربیت دیں۔
+ - ایک ماڈل کو کسی بھی وقت TF2.0/PyTorch/JAX فریم ورکس کے درمیان منتقل کریں۔
+ - تربیت، تشخیص، اور پروڈکشن کے لیے بغیر کسی رکاوٹ کے صحیح فریم ورک کا انتخاب کریں۔
+
+ 4. اپنے ضروریات کے مطابق آسانی سے ماڈل یا ایک مثال کو حسب ضرورت بنائیں:
+
+ - ہم ہر آرکیٹیکچر کے لیے مثالیں فراہم کرتے ہیں تاکہ اصل مصنفین کے شائع شدہ نتائج کو دوبارہ پیدا کیا جا سکے۔
+ - ماڈلز کی اندرونی تفصیلات کو جتنا ممکن ہو یکساں طور پر ظاہر کیا جاتا ہے۔
+ - فوری تجربات کے لیے ماڈل فائلز کو لائبریری سے آزادانہ طور پر استعمال کیا جا سکتا ہے۔
+
+## مجھے Transformers کیوں استعمال نہیں کرنا چاہیے؟
+
+- یہ لائبریری نیورل نیٹس کے لیے بلڈنگ بلاکس کا ماڈیولر ٹول باکس نہیں ہے۔ ماڈل فائلز میں موجود کوڈ جان بوجھ کر اضافی ایبسٹریکشنز کے ساتھ دوبارہ ترتیب نہیں دیا گیا ہے، تاکہ محققین بغیر اضافی ایبسٹریکشنز/فائلوں میں گئے ہوئے جلدی سے ہر ماڈل پر کام کر سکیں۔
+- تربیتی API کا مقصد کسی بھی ماڈل پر کام کرنے کے لیے نہیں ہے بلکہ یہ لائبریری کے فراہم کردہ ماڈلز کے ساتھ کام کرنے کے لیے بہتر بنایا گیا ہے۔ عام مشین لرننگ لوپس کے لیے، آپ کو دوسری لائبریری (ممکنہ طور پر [Accelerate](https://huggingface.co/docs/accelerate)) استعمال کرنی چاہیے۔
+- حالانکہ ہم جتنا ممکن ہو زیادہ سے زیادہ استعمال کے کیسز پیش کرنے کی کوشش کرتے ہیں، ہمارے [مثالوں کے فولڈر](https://github.com/huggingface/transformers/tree/main/examples) میں موجود اسکرپٹس صرف یہی ہیں: مثالیں۔ یہ توقع کی جاتی ہے کہ یہ آپ کے مخصوص مسئلے پر فوراً کام نہیں کریں گی اور آپ کو اپنی ضروریات کے مطابق کوڈ کی کچھ لائنیں تبدیل کرنی پڑیں گی۔
+
+### انسٹالیشن
+
+#### pip کے ساتھ
+
+یہ ریپوزٹری Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔
+
+آپ کو 🤗 Transformers کو ایک [ورچوئل ماحول](https://docs.python.org/3/library/venv.html) میں انسٹال کرنا چاہیے۔ اگر آپ Python ورچوئل ماحول سے واقف نہیں ہیں، تو [یوزر گائیڈ](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) دیکھیں۔
+
+پہلے، Python کے اس ورژن کے ساتھ ایک ورچوئل ماحول بنائیں جو آپ استعمال کر رہے ہیں اور اسے ایکٹیویٹ کریں۔
+
+پھر، آپ کو کم از کم Flax، PyTorch، یا TensorFlow میں سے کسی ایک کو انسٹال کرنے کی ضرورت ہوگی۔
+براہ کرم اپنے پلیٹ فارم کے لیے مخصوص انسٹالیشن کمانڈ کے حوالے سے [TensorFlow انسٹالیشن صفحہ](https://www.tensorflow.org/install/)، [PyTorch انسٹالیشن صفحہ](https://pytorch.org/get-started/locally/#start-locally) اور/یا [Flax](https://github.com/google/flax#quick-install) اور [Jax](https://github.com/google/jax#installation) انسٹالیشن صفحات دیکھیں۔
+
+جب ان میں سے کوئی ایک بیک اینڈ انسٹال ہو جائے، تو 🤗 Transformers کو pip کے ذریعے مندرجہ ذیل طریقے سے انسٹال کیا جا سکتا ہے:
+
+```bash
+pip install transformers
+```
+
+اگر آپ مثالوں کے ساتھ کھیلنا چاہتے ہیں یا آپ کو کوڈ کا تازہ ترین ورژن چاہیے اور آپ نئے ریلیز کا انتظار نہیں کر سکتے، تو آپ کو [سورس سے لائبریری انسٹال کرنی ہوگی](https://huggingface.co/docs/transformers/installation#installing-from-source)۔
+
+#### conda کے ساتھ
+
+🤗 Transformers کو conda کے ذریعے مندرجہ ذیل طریقے سے انسٹال کیا جا سکتا ہے:
+
+```shell script
+conda install conda-forge::transformers
+```
+
+> **_نوٹ:_** `transformers` کو `huggingface` چینل سے انسٹال کرنا اب ختم کیا جا چکا ہے۔
+
+Flax، PyTorch، یا TensorFlow کو conda کے ساتھ انسٹال کرنے کے لیے انسٹالیشن صفحات کی پیروی کریں۔
+
+> **_نوٹ:_** ونڈوز پر، آپ کو کیشنگ سے فائدہ اٹھانے کے لیے ڈویلپر موڈ کو ایکٹیویٹ کرنے کا پیغام دیا جا سکتا ہے۔ اگر یہ آپ کے لیے ممکن نہیں ہے، تو براہ کرم ہمیں [اس مسئلے](https://github.com/huggingface/huggingface_hub/issues/1062) میں بتائیں۔
+
+### ماڈل کی تعمیرات
+
+ 🤗 Transformers کی طرف سے فراہم کردہ **[تمام ماڈل چیک پوائنٹس](https://huggingface.co/models)** ہگنگ فیس کے ماڈل حب [model hub](https://huggingface.co/models) سے بآسانی مربوط ہیں، جہاں یہ براہ راست [صارفین](https://huggingface.co/users) اور [تنظیموں](https://huggingface.co/organizations) کے ذریعہ اپ لوڈ کیے جاتے ہیں۔
+
+چیک پوائنٹس کی موجودہ تعداد: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers فی الحال درج ذیل معماریاں فراہم کرتا ہے: ہر ایک کا اعلی سطحی خلاصہ دیکھنے کے لیے [یہاں](https://huggingface.co/docs/transformers/model_summary) دیکھیں۔
+
+یہ چیک کرنے کے لیے کہ ہر ماڈل کی Flax، PyTorch یا TensorFlow میں کوئی عملداری ہے یا 🤗 Tokenizers لائبریری کے ذریعہ سپورٹ کردہ ٹوکنائزر کے ساتھ ہے، [اس جدول](https://huggingface.co/docs/transformers/index#supported-frameworks) کا حوالہ لیں۔
+
+یہ عملداری مختلف ڈیٹا سیٹس پر ٹیسٹ کی گئی ہیں (مثال کے اسکرپٹس دیکھیں) اور اصل عملداری کی کارکردگی کے ہم آہنگ ہونی چاہئیں۔ آپ کو کارکردگی کی مزید تفصیلات [دستاویزات](https://github.com/huggingface/transformers/tree/main/examples) کے مثالوں کے سیکشن میں مل سکتی ہیں۔
+
+
+## مزید معلومات حاصل کریں
+
+| سیکشن | تفصیل |
+|-|-|
+| [دستاویزات](https://huggingface.co/docs/transformers/) | مکمل API دستاویزات اور ٹیوٹوریلز |
+| [ٹاسک کا خلاصہ](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers کے ذریعہ سپورٹ کردہ ٹاسک |
+| [پری پروسیسنگ ٹیوٹوریل](https://huggingface.co/docs/transformers/preprocessing) | ماڈلز کے لیے ڈیٹا تیار کرنے کے لیے `Tokenizer` کلاس کا استعمال |
+| [ٹریننگ اور فائن ٹیوننگ](https://huggingface.co/docs/transformers/training) | PyTorch/TensorFlow ٹریننگ لوپ میں 🤗 Transformers کی طرف سے فراہم کردہ ماڈلز کا استعمال اور `Trainer` API |
+| [تیز دورہ: فائن ٹیوننگ/استعمال کے اسکرپٹس](https://github.com/huggingface/transformers/tree/main/examples) | مختلف قسم کے ٹاسک پر ماڈلز کو فائن ٹیون کرنے کے لیے مثال کے اسکرپٹس |
+| [ماڈل کا اشتراک اور اپ لوڈ کرنا](https://huggingface.co/docs/transformers/model_sharing) | اپنے فائن ٹیون کردہ ماڈلز کو کمیونٹی کے ساتھ اپ لوڈ اور شیئر کریں |
+
+## استشہاد
+
+ہم نے اب ایک [تحقیقی مقالہ](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) تیار کیا ہے جسے آپ 🤗 Transformers لائبریری کے لیے حوالہ دے سکتے ہیں:
+
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+ title = "Transformers: State-of-the-Art Natural Language Processing",
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+ month = oct,
+ year = "2020",
+ address = "Online",
+ publisher = "Association for Computational Linguistics",
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+ pages = "38--45"
+}
+```
diff --git a/i18n/README_vi.md b/i18n/README_vi.md
index f85dda3e215d25..5e5c2ab1e25cf7 100644
--- a/i18n/README_vi.md
+++ b/i18n/README_vi.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng việt |
العربية |
+ اردو |
diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md
index f857f50d1a55c9..61f3a19849ff55 100644
--- a/i18n/README_zh-hans.md
+++ b/i18n/README_zh-hans.md
@@ -69,6 +69,7 @@ checkpoint: 检查点
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md
index 721e6575dec721..e20798a2d4571f 100644
--- a/i18n/README_zh-hant.md
+++ b/i18n/README_zh-hant.md
@@ -81,6 +81,7 @@ user: 使用者
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
From 4f1e9bae4e0a2d94e8a347964569dd1df385de55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ziy=C3=BA=20Ye?=
Date: Wed, 18 Sep 2024 07:23:05 -0700
Subject: [PATCH 29/50] fix the wandb logging issue (#33464)
* fix the wandb logging issue
* handle ConfigError in WandbCallback; move import to local scope
* update integration_utils.py; move import of ConfigError
* Update integration_utils.py: remove trailing whitespace
---
src/transformers/integrations/integration_utils.py | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index 9172f9599f77b0..40298f9c6fc77b 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -803,6 +803,10 @@ def setup(self, args, state, model, **kwargs):
if self._wandb is None:
return
self._initialized = True
+
+ # prepare to handle potential configuration issues during setup
+ from wandb.sdk.lib.config_util import ConfigError as WandbConfigError
+
if state.is_world_process_zero:
logger.info(
'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"'
@@ -852,7 +856,13 @@ def setup(self, args, state, model, **kwargs):
try:
self._wandb.config["model/num_parameters"] = model.num_parameters()
except AttributeError:
- logger.info("Could not log the number of model parameters in Weights & Biases.")
+ logger.info(
+ "Could not log the number of model parameters in Weights & Biases due to an AttributeError."
+ )
+ except WandbConfigError:
+ logger.warning(
+ "A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config."
+ )
# log the initial model architecture to an artifact
if self._log_model.is_enabled:
From f883827c0a0832b9dd53ede18aa7fffe74a1fec2 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 16:25:45 +0200
Subject: [PATCH 30/50] Fix tests in ASR pipeline (#33545)
---
..._pipelines_automatic_speech_recognition.py | 74 +++++++++----------
1 file changed, 35 insertions(+), 39 deletions(-)
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index abb07d831ad003..842933d2b76c94 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -295,8 +295,8 @@ def test_torch_large(self):
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@require_torch
@@ -312,8 +312,8 @@ def test_torch_large_with_input_features(self):
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
@slow
@@ -542,11 +542,11 @@ def test_torch_whisper(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
- output = speech_recognizer([filename], chunk_length_s=5, batch_size=4)
+ output = speech_recognizer([ds[40]["audio"]], chunk_length_s=5, batch_size=4)
self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}])
@require_torch
@@ -1014,8 +1014,8 @@ def test_torch_speech_encoder_decoder(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'})
@slow
@@ -1032,13 +1032,11 @@ def test_simple_wav2vec2(self):
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = asr(filename)
+ audio = ds[40]["audio"]
+ output = asr(audio)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
- filename = ds[40]["file"]
- with open(filename, "rb") as f:
- data = f.read()
+ data = Audio().encode_example(ds[40]["audio"])["bytes"]
output = asr(data)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@@ -1058,13 +1056,11 @@ def test_simple_s2t(self):
self.assertEqual(output, {"text": "(Applausi)"})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = asr(filename)
+ audio = ds[40]["audio"]
+ output = asr(audio)
self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
- filename = ds[40]["file"]
- with open(filename, "rb") as f:
- data = f.read()
+ data = Audio().encode_example(ds[40]["audio"])["bytes"]
output = asr(data)
self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
@@ -1078,13 +1074,13 @@ def test_simple_whisper_asr(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- filename = ds[0]["file"]
- output = speech_recognizer(filename)
+ audio = ds[0]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(
output,
{"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
)
- output = speech_recognizer(filename, return_timestamps=True)
+ output = speech_recognizer(ds[0]["audio"], return_timestamps=True)
self.assertEqual(
output,
{
@@ -1100,7 +1096,7 @@ def test_simple_whisper_asr(self):
},
)
speech_recognizer.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
- output = speech_recognizer(filename, return_timestamps="word")
+ output = speech_recognizer(ds[0]["audio"], return_timestamps="word")
# fmt: off
self.assertEqual(
output,
@@ -1135,7 +1131,7 @@ def test_simple_whisper_asr(self):
"^Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
"Use `return_timestamps='word'` or `return_timestamps=True` respectively.$",
):
- _ = speech_recognizer(filename, return_timestamps="char")
+ _ = speech_recognizer(audio, return_timestamps="char")
@slow
@require_torch
@@ -1147,8 +1143,8 @@ def test_simple_whisper_translation(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
@@ -1158,7 +1154,7 @@ def test_simple_whisper_translation(self):
speech_recognizer_2 = AutomaticSpeechRecognitionPipeline(
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
)
- output_2 = speech_recognizer_2(filename)
+ output_2 = speech_recognizer_2(ds[0]["audio"])
self.assertEqual(output, output_2)
# either use generate_kwargs or set the model's generation_config
@@ -1170,7 +1166,7 @@ def test_simple_whisper_translation(self):
feature_extractor=feature_extractor,
generate_kwargs={"task": "transcribe", "language": "<|it|>"},
)
- output_3 = speech_translator(filename)
+ output_3 = speech_translator(ds[0]["audio"])
self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."})
@slow
@@ -1182,10 +1178,10 @@ def test_whisper_language(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- filename = ds[0]["file"]
+ audio = ds[0]["audio"]
# 1. English-only model compatible with no language argument
- output = speech_recognizer(filename)
+ output = speech_recognizer(audio)
self.assertEqual(
output,
{"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
@@ -1197,7 +1193,7 @@ def test_whisper_language(self):
"Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, "
"pass `is_multilingual=True` to generate, or update the generation config.",
):
- _ = speech_recognizer(filename, generate_kwargs={"language": "en"})
+ _ = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"})
# 3. Multilingual model accepts language argument
speech_recognizer = pipeline(
@@ -1205,7 +1201,7 @@ def test_whisper_language(self):
model="openai/whisper-tiny",
framework="pt",
)
- output = speech_recognizer(filename, generate_kwargs={"language": "en"})
+ output = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"})
self.assertEqual(
output,
{"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."},
@@ -1315,8 +1311,8 @@ def test_xls_r_to_en(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})
@slow
@@ -1331,8 +1327,8 @@ def test_xls_r_from_en(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})
@slow
@@ -1348,9 +1344,8 @@ def test_speech_to_text_leveraged(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
-
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
@slow
@@ -1561,6 +1556,7 @@ def test_whisper_longform(self):
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
device=torch_device,
+ return_timestamps=True, # to allow longform generation
)
ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
From fc83a4d45921150b4c23b68d08ac9ee946070149 Mon Sep 17 00:00:00 2001
From: Umar Butler
Date: Thu, 19 Sep 2024 00:41:50 +1000
Subject: [PATCH 31/50] Added support for bfloat16 to zero-shot classification
pipeline (#33554)
* Added support for bfloat16 to zero-shot classification pipeline
* Ensure support for TF.
Co-authored-by: Matt
* Remove dependency on `torch`.
Co-authored-by: Matt
---------
Co-authored-by: Matt
---
src/transformers/pipelines/zero_shot_classification.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index 9a600bc8ad0fb8..f4aee3341e30d5 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -239,7 +239,10 @@ def _forward(self, inputs):
def postprocess(self, model_outputs, multi_label=False):
candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
sequences = [outputs["sequence"] for outputs in model_outputs]
- logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
+ if self.framework == "pt":
+ logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs])
+ else:
+ logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
N = logits.shape[0]
n = len(candidate_labels)
num_sequences = N // n
From 7542fac2c7e5e5761fb0394b045a8e4d9168da1c Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Wed, 18 Sep 2024 15:43:06 +0100
Subject: [PATCH 32/50] =?UTF-8?q?Pipeline:=20no=20side-effects=20on=20`mod?=
=?UTF-8?q?el.config`=20and=20`model.generation=5Fconfig`=20=F0=9F=94=AB?=
=?UTF-8?q?=20=20(#33480)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../generation/configuration_utils.py | 4 ++
src/transformers/generation/utils.py | 13 ++++---
.../pipelines/automatic_speech_recognition.py | 4 ++
src/transformers/pipelines/base.py | 37 +++++++++++--------
.../pipelines/document_question_answering.py | 4 ++
src/transformers/pipelines/image_to_text.py | 4 ++
.../pipelines/table_question_answering.py | 4 ++
.../pipelines/text2text_generation.py | 11 ++++--
src/transformers/pipelines/text_generation.py | 13 ++++---
src/transformers/pipelines/text_to_audio.py | 6 ++-
.../pipelines/visual_question_answering.py | 4 ++
tests/pipelines/test_pipelines_common.py | 26 +++++++++++++
tests/utils/test_modeling_utils.py | 32 ++++++++++++++++
13 files changed, 132 insertions(+), 30 deletions(-)
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index e2585b1b9ed49c..5e9ac835c19d6d 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -1229,6 +1229,10 @@ def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig"
"""
config_dict = model_config.to_dict()
config_dict.pop("_from_model_config", None)
+
+ # Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
+ config_dict = {key: value for key, value in config_dict.items() if value is not None}
+
generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)
# Special case: some models have generation attributes set in the decoder. Use them if still unset in the
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 019eb6c27f18cc..d8896f91267d7b 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1334,23 +1334,26 @@ def _prepare_generation_config(
# the following conditions must be met
# 1) the generation config must have been created from the model config (`_from_model_config` field);
# 2) the generation config must have seen no modification since its creation (the hash is the same);
- # 3) the user must have set generation parameters in the model config.
+ # 3) there are non-default generation parameters in the model config.
+ # 4) the user must have set new generation parameters in the model config.
# NOTE: `torch.compile` can't compile `hash`, this legacy support is disabled with compilation.
if (
not is_torchdynamo_compiling()
and self.generation_config._from_model_config # 1)
and self.generation_config._original_object_hash == hash(self.generation_config) # 2)
+ and len(self.config._get_non_default_generation_parameters()) > 0 # 3)
):
new_generation_config = GenerationConfig.from_model_config(self.config)
- if new_generation_config != self.generation_config: # 3)
+ if new_generation_config != self.generation_config: # 4)
warnings.warn(
"You have modified the pretrained model configuration to control generation. This is a"
- " deprecated strategy to control generation and will be removed soon, in a future version."
+ " deprecated strategy to control generation and will be removed in v5."
" Please use and modify the model generation configuration (see"
- " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
+ " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+ UserWarning,
)
self.generation_config = new_generation_config
- using_model_generation_config = True
+
generation_config = self.generation_config
using_model_generation_config = True
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index f3de341d88954c..7c122bed5437cc 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -501,6 +501,10 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
else:
generate_kwargs["num_frames"] = num_frames
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
tokens = self.model.generate(
inputs=inputs,
attention_mask=attention_mask,
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 7db33ab5bd1a01..40a91a0d484b8e 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
+import copy
import csv
import importlib
import json
@@ -899,22 +900,26 @@ def __init__(
):
self.model.to(self.device)
- # Update config and generation_config with task specific parameters
- task_specific_params = self.model.config.task_specific_params
- if task_specific_params is not None and task in task_specific_params:
- self.model.config.update(task_specific_params.get(task))
- if self.model.can_generate():
- self.model.generation_config.update(**task_specific_params.get(task))
-
- # Pipelines calling `generate`: if the tokenizer has a pad token but the model doesn't, set it in the
- # forward params so that `generate` is aware of the pad token.
- if (
- self.tokenizer is not None
- and self.model.can_generate()
- and self.tokenizer.pad_token_id is not None
- and self.model.generation_config.pad_token_id is None
- ):
- self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
+ # If the model can generate, create a local generation config. This is done to avoid side-effects on the model
+ # as we apply local tweaks to the generation config.
+ if self.model.can_generate():
+ self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None
+ self.generation_config = copy.deepcopy(self.model.generation_config)
+ # Update the generation config with task specific params if they exist
+ # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config.
+ task_specific_params = self.model.config.task_specific_params
+ if task_specific_params is not None and task in task_specific_params:
+ this_task_params = task_specific_params.get(task)
+ if "prefix" in this_task_params:
+ self.prefix = this_task_params.pop("prefix")
+ self.generation_config.update(**this_task_params)
+ # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it.
+ if (
+ self.tokenizer is not None
+ and self.tokenizer.pad_token_id is not None
+ and self.generation_config.pad_token_id is None
+ ):
+ self.generation_config.pad_token_id = self.tokenizer.pad_token_id
self.call_count = 0
self._batch_size = kwargs.pop("batch_size", None)
diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py
index aa4fb48aae6a40..9198f432263822 100644
--- a/src/transformers/pipelines/document_question_answering.py
+++ b/src/transformers/pipelines/document_question_answering.py
@@ -429,6 +429,10 @@ def _forward(self, model_inputs, **generate_kwargs):
is_last = model_inputs.pop("is_last", False)
if self.model_type == ModelType.VisionEncoderDecoder:
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
else:
model_outputs = self.model(**model_inputs)
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index 88dce8e591ae41..91d44c46d25c10 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -181,6 +181,10 @@ def _forward(self, model_inputs, **generate_kwargs):
):
model_inputs["input_ids"] = None
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
# FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`
# parse inputs. In the Tensorflow version, `generate` raises an error if we don't use `input_ids` whereas
# the PyTorch version matches it with `self.model.main_input_name` or `self.model.encoder.main_input_name`
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index 702a47b7c3cbed..77c95432c7218f 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -385,6 +385,10 @@ def _forward(self, model_inputs, sequential=False, **generate_kwargs):
else:
outputs = self.batch_inference(**model_inputs)
else:
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
outputs = self.model.generate(**model_inputs, **generate_kwargs)
model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs}
return model_outputs
diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 42d97f4d11b919..75ded8ac085ca5 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -115,7 +115,7 @@ def check_inputs(self, input_length: int, min_length: int, max_length: int):
return True
def _parse_and_tokenize(self, *args, truncation):
- prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
+ prefix = self.prefix if self.prefix is not None else ""
if isinstance(args[0], list):
if self.tokenizer.pad_token_id is None:
raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
@@ -185,9 +185,14 @@ def _forward(self, model_inputs, **generate_kwargs):
self.check_inputs(
input_length,
- generate_kwargs.get("min_length", self.model.config.min_length),
- generate_kwargs.get("max_length", self.model.config.max_length),
+ generate_kwargs.get("min_length", self.generation_config.min_length),
+ generate_kwargs.get("max_length", self.generation_config.max_length),
)
+
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
out_b = output_ids.shape[0]
if self.framework == "pt":
diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py
index 8bd1017ffc6696..9bffca522d5f2e 100644
--- a/src/transformers/pipelines/text_generation.py
+++ b/src/transformers/pipelines/text_generation.py
@@ -103,8 +103,8 @@ def __init__(self, *args, **kwargs):
# It also defines both some preprocess_kwargs and generate_kwargs
# which is why we cannot put them in their respective methods.
prefix = None
- if self.model.config.prefix is not None:
- prefix = self.model.config.prefix
+ if self.prefix is not None:
+ prefix = self.prefix
if prefix is None and self.model.__class__.__name__ in [
"XLNetLMHeadModel",
"TransfoXLLMHeadModel",
@@ -316,7 +316,7 @@ def preprocess(
if "max_new_tokens" in generate_kwargs:
new_tokens = generate_kwargs["max_new_tokens"]
else:
- new_tokens = generate_kwargs.get("max_length", self.model.config.max_length) - cur_len
+ new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len
if new_tokens < 0:
raise ValueError("We cannot infer how many new tokens are expected")
if cur_len + new_tokens > self.tokenizer.model_max_length:
@@ -354,7 +354,7 @@ def _forward(self, model_inputs, **generate_kwargs):
and generate_kwargs["generation_config"].max_new_tokens is not None
)
if not has_max_new_tokens:
- generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length
+ generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length
generate_kwargs["max_length"] += prefix_length
has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
"generation_config" in generate_kwargs
@@ -363,7 +363,10 @@ def _forward(self, model_inputs, **generate_kwargs):
if not has_min_new_tokens and "min_length" in generate_kwargs:
generate_kwargs["min_length"] += prefix_length
- # BS x SL
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
out_b = generated_sequence.shape[0]
if self.framework == "pt":
diff --git a/src/transformers/pipelines/text_to_audio.py b/src/transformers/pipelines/text_to_audio.py
index 81653f14d6d878..d17d18205920b0 100644
--- a/src/transformers/pipelines/text_to_audio.py
+++ b/src/transformers/pipelines/text_to_audio.py
@@ -111,7 +111,7 @@ def preprocess(self, text, **kwargs):
if self.model.config.model_type == "bark":
# bark Tokenizer is called with BarkProcessor which uses those kwargs
new_kwargs = {
- "max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
+ "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256),
"add_special_tokens": False,
"return_attention_mask": True,
"return_token_type_ids": False,
@@ -137,6 +137,10 @@ def _forward(self, model_inputs, **kwargs):
# we expect some kwargs to be additional tensors which need to be on the right device
generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device)
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
# generate_kwargs get priority over forward_params
forward_params.update(generate_kwargs)
diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py
index e5849cbdec1955..89988c0cba2b1b 100644
--- a/src/transformers/pipelines/visual_question_answering.py
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -162,6 +162,10 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
def _forward(self, model_inputs, **generate_kwargs):
if self.model.can_generate():
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
else:
model_outputs = self.model(**model_inputs)
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index ea36ae5728d161..1fec4be3d95ca0 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -31,6 +31,7 @@
AutoTokenizer,
DistilBertForSequenceClassification,
MaskGenerationPipeline,
+ T5ForConditionalGeneration,
TextClassificationPipeline,
TextGenerationPipeline,
TFAutoModelForSequenceClassification,
@@ -234,6 +235,31 @@ def test_auto_model_pipeline_registration_from_local_dir(self):
self.assertIsInstance(pipe, TextGenerationPipeline) # Assert successful load
+ @require_torch
+ def test_pipeline_with_task_parameters_no_side_effects(self):
+ """
+ Regression test: certain pipeline flags, like `task`, modified the model configuration, causing unexpected
+ side-effects
+ """
+ # This checkpoint has task-specific parameters that will modify the behavior of the pipeline
+ model = T5ForConditionalGeneration.from_pretrained("t5-small")
+ self.assertTrue(model.config.num_beams == 1)
+
+ # The task-specific parameters used to cause side-effects on `model.config` -- not anymore
+ pipe = pipeline(model=model, tokenizer=AutoTokenizer.from_pretrained("t5-small"), task="translation_en_to_de")
+ self.assertTrue(model.config.num_beams == 1)
+ self.assertTrue(model.generation_config.num_beams == 1)
+
+ # Under the hood: we now store a generation config in the pipeline. This generation config stores the
+ # task-specific parameters.
+ self.assertTrue(pipe.generation_config.num_beams == 4)
+
+ # We can confirm that the task-specific parameters have an effect. (In this case, the default is `num_beams=1`,
+ # which would crash when `num_return_sequences=4` is passed.)
+ pipe("Hugging Face doesn't sell hugs.", num_return_sequences=4)
+ with self.assertRaises(ValueError):
+ pipe("Hugging Face doesn't sell hugs.", num_return_sequences=4, num_beams=1)
+
@is_pipeline_test
class PipelineScikitCompatTest(unittest.TestCase):
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index f78285fdb90d90..2130ed4b7c887f 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -1715,6 +1715,38 @@ def test_isin_mps_friendly(self):
torch.equal(torch.isin(random_ids, random_test_tensor), isin_mps_friendly(random_ids, random_test_tensor))
)
+ def test_save_and_load_config_with_custom_generation(self):
+ """
+ Regression test for the ability to save and load a config with a custom generation kwarg (i.e. a parameter
+ that gets moved to the generation config and reset on the model config)
+ """
+ model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+
+ # The default for `num_beams` is 1 and `early_stopping` is False
+ self.assertTrue(model.config.num_beams == 1)
+ self.assertTrue(model.config.early_stopping is False)
+
+ # When we save the model, this custom parameter should be moved to the generation config AND the model
+ # config should contain `None`
+ model.config.num_beams = 2
+ model.config.early_stopping = True
+ self.assertTrue(model.generation_config.num_beams == 1) # unmodified generation config
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.save_pretrained(tmp_dir)
+ new_model = T5ForConditionalGeneration.from_pretrained(tmp_dir)
+ # moved to generation config
+ self.assertTrue(new_model.generation_config.num_beams == 2)
+ self.assertTrue(new_model.generation_config.early_stopping is True)
+ # reset in the model config
+ self.assertTrue(new_model.config.num_beams is None)
+ self.assertTrue(new_model.config.early_stopping is None)
+
+ # Sanity check: We can run `generate` with the new model without any warnings
+ random_ids = torch.randint(0, 100, (1, 5))
+ with warnings.catch_warnings(record=True) as w:
+ new_model.generate(random_ids, max_new_tokens=3)
+ self.assertTrue(len(w) == 0)
+
@slow
@require_torch
From 8efc06ee1863bd6e34e8adb7b10901da87c66818 Mon Sep 17 00:00:00 2001
From: Matt
Date: Wed, 18 Sep 2024 15:57:39 +0100
Subject: [PATCH 33/50] Return attention mask in ASR pipeline to avoid warnings
(#33509)
return attention mask in ASR pipeline
---
.../pipelines/automatic_speech_recognition.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 7c122bed5437cc..4301982f1e901c 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -440,6 +440,7 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
truncation=False,
padding="longest",
return_tensors="pt",
+ return_attention_mask=True,
)
else:
if self.type == "seq2seq_whisper" and stride is None:
@@ -448,13 +449,16 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
sampling_rate=self.feature_extractor.sampling_rate,
return_tensors="pt",
return_token_timestamps=True,
+ return_attention_mask=True,
)
extra["num_frames"] = processed.pop("num_frames")
else:
processed = self.feature_extractor(
- inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ return_tensors="pt",
+ return_attention_mask=True,
)
-
if self.torch_dtype is not None:
processed = processed.to(dtype=self.torch_dtype)
if stride is not None:
From 9db963aeed419c8379c6d6425186fec0bfb86908 Mon Sep 17 00:00:00 2001
From: Dominik Niedziela <99881522+dom-dziela@users.noreply.github.com>
Date: Wed, 18 Sep 2024 17:38:31 +0200
Subject: [PATCH 34/50] enforce original size to be a list (#33564)
* enforce original size to be a list
* formatting
* apply datatype change to unpad_image in llava_next
---
src/transformers/models/llava_next/modeling_llava_next.py | 6 ++++++
.../models/llava_onevision/modeling_llava_onevision.py | 6 ++++++
2 files changed, 12 insertions(+)
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index c1d1ca8c276d7a..bf76921090b244 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -123,6 +123,12 @@ def unpad_image(tensor, original_size):
Returns:
`torch.Tensor`: The unpadded image tensor.
"""
+ if not isinstance(original_size, (list, tuple)):
+ if not isinstance(original_size, (torch.Tensor, np.ndarray)):
+ raise TypeError(
+ f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+ )
+ original_size = original_size.tolist()
original_height, original_width = original_size
current_height, current_width = tensor.shape[1:]
diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
index 697ea84fea5040..d3200fb5193d4b 100644
--- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -124,6 +124,12 @@ def unpad_image(tensor, original_size):
Returns:
`torch.Tensor`: The unpadded image tensor.
"""
+ if not isinstance(original_size, (list, tuple)):
+ if not isinstance(original_size, (torch.Tensor, np.ndarray)):
+ raise TypeError(
+ f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+ )
+ original_size = original_size.tolist()
original_height, original_width = original_size
current_height, current_width = tensor.shape[1:]
From 7b1ce634cb16f86725826e427bf30f1276cc0e19 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Wed, 18 Sep 2024 12:56:45 -0400
Subject: [PATCH 35/50] Improve compiled RT-DETR inference speed (#33412)
* modify rt detr to improve inference times when compiled
* Remove redundant "to"
* Fix conditional lru_cache and missing shapes_list
* nit unnecessary list creation
* Fix compile error when ninja not available and custom kernel activated
---
.../models/rt_detr/modeling_rt_detr.py | 86 ++++++++++++++-----
1 file changed, 64 insertions(+), 22 deletions(-)
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index ab83a81f50674d..4e32434901cdc7 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -18,7 +18,7 @@
import os
import warnings
from dataclasses import dataclass
-from functools import lru_cache, partial
+from functools import lru_cache, partial, wraps
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
@@ -737,7 +737,9 @@ def multi_scale_deformable_attention(
) -> Tensor:
batch_size, _, num_heads, hidden_dim = value.shape
_, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
- value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
+ # Ignore copy
+ value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
+
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for level_id, (height, width) in enumerate(value_spatial_shapes):
@@ -849,6 +851,7 @@ def forward(
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
+ spatial_shapes_list=None,
level_start_index=None,
output_attentions: bool = False,
):
@@ -858,7 +861,10 @@ def forward(
batch_size, num_queries, _ = hidden_states.shape
batch_size, sequence_length, _ = encoder_hidden_states.shape
- if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+
+ # Ignore copy
+ total_elements = sum(shape[0] * shape[1] for shape in spatial_shapes_list)
+ if total_elements != sequence_length:
raise ValueError(
"Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
)
@@ -893,9 +899,12 @@ def forward(
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
- if self.disable_custom_kernels:
+ # Ignore copy
+ if self.disable_custom_kernels or MultiScaleDeformableAttention is None:
# PyTorch implementation
- output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+ output = multi_scale_deformable_attention(
+ value, spatial_shapes_list, sampling_locations, attention_weights
+ )
else:
try:
# custom kernel
@@ -909,7 +918,9 @@ def forward(
)
except Exception:
# PyTorch implementation
- output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+ output = multi_scale_deformable_attention(
+ value, spatial_shapes_list, sampling_locations, attention_weights
+ )
output = self.output_proj(output)
return output, attention_weights
@@ -1064,6 +1075,7 @@ def forward(
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
+ spatial_shapes_list=None,
level_start_index=None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
@@ -1114,6 +1126,7 @@ def forward(
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
@@ -1299,14 +1312,16 @@ def __init__(self, config: RTDetrConfig):
self.pan_blocks.append(RTDetrCSPRepLayer(config))
@staticmethod
- def build_2d_sincos_position_embedding(width, height, embed_dim=256, temperature=10000.0):
- grid_w = torch.arange(int(width), dtype=torch.float32)
- grid_h = torch.arange(int(height), dtype=torch.float32)
+ def build_2d_sincos_position_embedding(
+ width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+ ):
+ grid_w = torch.arange(int(width), dtype=dtype, device=device)
+ grid_h = torch.arange(int(height), dtype=dtype, device=device)
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
if embed_dim % 4 != 0:
raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
pos_dim = embed_dim // 4
- omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+ omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
omega = 1.0 / (temperature**omega)
out_w = grid_w.flatten()[..., None] @ omega[None]
@@ -1372,8 +1387,13 @@ def forward(
src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1)
if self.training or self.eval_size is None:
pos_embed = self.build_2d_sincos_position_embedding(
- width, height, self.encoder_hidden_dim, self.positional_encoding_temperature
- ).to(src_flatten.device, src_flatten.dtype)
+ width,
+ height,
+ self.encoder_hidden_dim,
+ self.positional_encoding_temperature,
+ device=src_flatten.device,
+ dtype=src_flatten.dtype,
+ )
else:
pos_embed = None
@@ -1441,6 +1461,7 @@ def forward(
position_embeddings=None,
reference_points=None,
spatial_shapes=None,
+ spatial_shapes_list=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
@@ -1512,6 +1533,7 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
reference_points=reference_points_input,
spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
level_start_index=level_start_index,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
@@ -1575,6 +1597,27 @@ def forward(
)
+def compile_compatible_lru_cache(*lru_args, **lru_kwargs):
+ def decorator(func):
+ @wraps(func)
+ def wrapper(self, *args, **kwargs):
+ if not torch.compiler.is_compiling():
+ # Cache the function only if the model is not being compiled
+ # check if the function is already cached, otherwise create it
+ if not hasattr(self, f"_cached_{func.__name__}"):
+ self.__setattr__(
+ f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self))
+ )
+ return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs)
+ else:
+ # Otherwise, just call the original function
+ return func(self, *args, **kwargs)
+
+ return wrapper
+
+ return decorator
+
+
@add_start_docstrings(
"""
RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
@@ -1626,7 +1669,7 @@ def __init__(self, config: RTDetrConfig):
# init encoder output anchors and valid_mask
if config.anchor_image_size:
- self.anchors, self.valid_mask = self.generate_anchors()
+ self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype)
# Create decoder input projection layers
# https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
@@ -1669,12 +1712,8 @@ def unfreeze_backbone(self):
for param in self.backbone.parameters():
param.requires_grad_(True)
- @lru_cache(maxsize=32)
- def generate_anchors(self, spatial_shapes=None, grid_size=0.05):
- # We always generate anchors in float32 to preserve equivalence between
- # dynamic and static anchor inference
- dtype = torch.float32
-
+ @compile_compatible_lru_cache(maxsize=32)
+ def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32):
if spatial_shapes is None:
spatial_shapes = [
[int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)]
@@ -1683,10 +1722,12 @@ def generate_anchors(self, spatial_shapes=None, grid_size=0.05):
anchors = []
for level, (height, width) in enumerate(spatial_shapes):
grid_y, grid_x = torch.meshgrid(
- torch.arange(end=height, dtype=dtype), torch.arange(end=width, dtype=dtype), indexing="ij"
+ torch.arange(end=height, dtype=dtype, device=device),
+ torch.arange(end=width, dtype=dtype, device=device),
+ indexing="ij",
)
grid_xy = torch.stack([grid_x, grid_y], -1)
- valid_wh = torch.tensor([width, height]).to(dtype)
+ valid_wh = torch.tensor([width, height], device=device).to(dtype)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_wh
wh = torch.ones_like(grid_xy) * grid_size * (2.0**level)
anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4))
@@ -1826,7 +1867,7 @@ def forward(
# Pass spatial_shapes as tuple to make it hashable and make sure
# lru_cache is working for generate_anchors()
spatial_shapes_tuple = tuple(spatial_shapes_list)
- anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple)
+ anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
else:
anchors, valid_mask = self.anchors, self.valid_mask
@@ -1873,6 +1914,7 @@ def forward(
encoder_attention_mask=attention_mask,
reference_points=init_reference_points,
spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
level_start_index=level_start_index,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
From 6019f3ff7805f94b4bec1ad5fcf8c438ecb03ee6 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 18 Sep 2024 19:10:28 +0200
Subject: [PATCH 36/50] Fix bnb dequantization (#33546)
---
src/transformers/integrations/bitsandbytes.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index c49d353ccb520b..f37ca9a2650bf3 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -437,6 +437,7 @@ def _dequantize_and_replace(
new_module.to(device)
model._modules[name] = new_module
+ has_been_replaced = True
if len(list(module.children())) > 0:
_, has_been_replaced = _dequantize_and_replace(
module,
From 5af7d41e49bbfc8319f462eb45253dcb3863dfb7 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 19:23:44 +0200
Subject: [PATCH 37/50] Codec integration (#33565)
* clean mimi commit
* some nits suggestions from Arthur
* make fixup
* rename repo id + change readme
* Update docs/source/en/model_doc/mimi.md
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
* add flaky flag to batching equivalence due to audio_codes failing sometimes
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/index.md | 1 +
docs/source/en/model_doc/mimi.md | 69 +
docs/source/en/perf_infer_gpu_one.md | 2 +
src/transformers/__init__.py | 14 +
src/transformers/models/__init__.py | 1 +
.../models/auto/configuration_auto.py | 2 +
.../models/auto/feature_extraction_auto.py | 1 +
src/transformers/models/auto/modeling_auto.py | 1 +
src/transformers/models/mimi/__init__.py | 57 +
.../models/mimi/configuration_mimi.py | 234 +++
.../convert_mimi_checkpoint_to_pytorch.py | 198 ++
src/transformers/models/mimi/modeling_mimi.py | 1722 +++++++++++++++++
src/transformers/utils/dummy_pt_objects.py | 14 +
tests/models/mimi/__init__.py | 0
tests/models/mimi/test_modeling_mimi.py | 890 +++++++++
16 files changed, 3208 insertions(+)
create mode 100644 docs/source/en/model_doc/mimi.md
create mode 100644 src/transformers/models/mimi/__init__.py
create mode 100644 src/transformers/models/mimi/configuration_mimi.py
create mode 100644 src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
create mode 100644 src/transformers/models/mimi/modeling_mimi.py
create mode 100644 tests/models/mimi/__init__.py
create mode 100644 tests/models/mimi/test_modeling_mimi.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 7eff2a38302669..59f0ff48d22a75 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -722,6 +722,8 @@
title: Hubert
- local: model_doc/mctct
title: MCTCT
+ - local: model_doc/mimi
+ title: Mimi
- local: model_doc/mms
title: MMS
- local: model_doc/musicgen
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index c18426de4c031c..cc5d7990929aa4 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -210,6 +210,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Megatron-BERT](model_doc/megatron-bert) | ✅ | ❌ | ❌ |
| [Megatron-GPT2](model_doc/megatron_gpt2) | ✅ | ✅ | ✅ |
| [MGP-STR](model_doc/mgp-str) | ✅ | ❌ | ❌ |
+| [Mimi](model_doc/mimi) | ✅ | ❌ | ❌ |
| [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ |
| [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ |
| [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md
new file mode 100644
index 00000000000000..486d1836334949
--- /dev/null
+++ b/docs/source/en/model_doc/mimi.md
@@ -0,0 +1,69 @@
+
+
+# Mimi
+
+## Overview
+
+The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team, that combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps. In other words, it can be used to map audio waveforms into “audio tokens”, known as “codebooks”.
+
+The abstract from the paper is the following:
+
+*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.*
+
+Its architecture is based on [Encodec](model_doc/encodec) with several major differences:
+* it uses a much lower frame-rate.
+* it uses additional transformers for encoding and decoding for better latent contextualization.
+* it uses a different quantization scheme: one codebook is dedicated to semantic projection.
+
+## Usage example
+
+Here is a quick example of how to encode and decode audio using this model:
+
+```python
+>>> from datasets import load_dataset, Audio
+>>> from transformers import MimiModel, AutoFeatureExtractor
+>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+>>> # load model and feature extractor
+>>> model = MimiModel.from_pretrained("kyutai/mimi")
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
+
+>>> # load audio sample
+>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
+>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
+>>> inputs = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
+
+>>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
+>>> audio_values = model.decode(encoder_outputs.audio_codes, inputs["padding_mask"])[0]
+>>> # or the equivalent with a forward pass
+>>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
+```
+
+This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).
+The original code can be found [here](https://github.com/kyutai-labs/moshi).
+
+
+## MimiConfig
+
+[[autodoc]] MimiConfig
+
+## MimiModel
+
+[[autodoc]] MimiModel
+ - decode
+ - encode
+ - forward
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index dd3433f2cd4862..4c220dd0f1483c 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -61,6 +61,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video)
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
+* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
* [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava)
* [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)
@@ -228,6 +229,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
+* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index bfd0d37916b553..aa13a97fe46150 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -573,6 +573,7 @@
"MgpstrProcessor",
"MgpstrTokenizer",
],
+ "models.mimi": ["MimiConfig"],
"models.mistral": ["MistralConfig"],
"models.mixtral": ["MixtralConfig"],
"models.mluke": [],
@@ -2666,6 +2667,12 @@
"MgpstrPreTrainedModel",
]
)
+ _import_structure["models.mimi"].extend(
+ [
+ "MimiModel",
+ "MimiPreTrainedModel",
+ ]
+ )
_import_structure["models.mistral"].extend(
[
"MistralForCausalLM",
@@ -5345,6 +5352,9 @@
MgpstrProcessor,
MgpstrTokenizer,
)
+ from .models.mimi import (
+ MimiConfig,
+ )
from .models.mistral import MistralConfig
from .models.mixtral import MixtralConfig
from .models.mobilebert import (
@@ -7212,6 +7222,10 @@
MgpstrModel,
MgpstrPreTrainedModel,
)
+ from .models.mimi import (
+ MimiModel,
+ MimiPreTrainedModel,
+ )
from .models.mistral import (
MistralForCausalLM,
MistralForSequenceClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 2022048cd4553f..5b5d1e7902bd67 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -149,6 +149,7 @@
megatron_bert,
megatron_gpt2,
mgp_str,
+ mimi,
mistral,
mixtral,
mluke,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 2cd7d550d90b7a..5a6ec14e78cd43 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -167,6 +167,7 @@
("mega", "MegaConfig"),
("megatron-bert", "MegatronBertConfig"),
("mgp-str", "MgpstrConfig"),
+ ("mimi", "MimiConfig"),
("mistral", "MistralConfig"),
("mixtral", "MixtralConfig"),
("mobilebert", "MobileBertConfig"),
@@ -468,6 +469,7 @@
("megatron-bert", "Megatron-BERT"),
("megatron_gpt2", "Megatron-GPT2"),
("mgp-str", "MGP-STR"),
+ ("mimi", "Mimi"),
("mistral", "Mistral"),
("mixtral", "Mixtral"),
("mluke", "mLUKE"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 7f335d66584f9f..dca0c08aa90957 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -69,6 +69,7 @@
("levit", "LevitFeatureExtractor"),
("maskformer", "MaskFormerFeatureExtractor"),
("mctct", "MCTCTFeatureExtractor"),
+ ("mimi", "EncodecFeatureExtractor"),
("mobilenet_v1", "MobileNetV1FeatureExtractor"),
("mobilenet_v2", "MobileNetV2FeatureExtractor"),
("mobilevit", "MobileViTFeatureExtractor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index e0d15f1e236590..2bc71f07970aee 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -158,6 +158,7 @@
("mega", "MegaModel"),
("megatron-bert", "MegatronBertModel"),
("mgp-str", "MgpstrForSceneTextRecognition"),
+ ("mimi", "MimiModel"),
("mistral", "MistralModel"),
("mixtral", "MixtralModel"),
("mobilebert", "MobileBertModel"),
diff --git a/src/transformers/models/mimi/__init__.py b/src/transformers/models/mimi/__init__.py
new file mode 100644
index 00000000000000..43b2bec6caa5b3
--- /dev/null
+++ b/src/transformers/models/mimi/__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ is_torch_available,
+)
+
+
+_import_structure = {
+ "configuration_mimi": ["MimiConfig"],
+}
+
+try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_mimi"] = [
+ "MimiModel",
+ "MimiPreTrainedModel",
+ ]
+
+if TYPE_CHECKING:
+ from .configuration_mimi import (
+ MimiConfig,
+ )
+
+ try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_mimi import (
+ MimiModel,
+ MimiPreTrainedModel,
+ )
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py
new file mode 100644
index 00000000000000..5564b1a54ba63b
--- /dev/null
+++ b/src/transformers/models/mimi/configuration_mimi.py
@@ -0,0 +1,234 @@
+# coding=utf-8
+# Copyright 2024 Meta Platforms, Inc. and affiliates, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mimi model configuration"""
+
+import math
+
+import numpy as np
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MimiConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`MimiModel`]. It is used to instantiate a
+    Mimi model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [kyutai/mimi](https://huggingface.co/kyutai/mimi) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
+        frame_rate (`float`, *optional*, defaults to 12.5):
+            Framerate of the model.
+        audio_channels (`int`, *optional*, defaults to 1):
+            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
+        hidden_size (`int`, *optional*, defaults to 512):
+            Intermediate representation dimension.
+        num_filters (`int`, *optional*, defaults to 64):
+            Number of convolution kernels of first `MimiConv1d` down sampling layer.
+        num_residual_layers (`int`, *optional*, defaults to 1):
+            Number of residual layers.
+        upsampling_ratios (`Sequence[int]`, *optional*):
+            Kernel size and stride ratios. The encoder uses downsampling ratios instead of upsampling ratios, hence it
+            will use the ratios in the reverse order to the ones specified here that must match the decoder order.
+            If not specified (or empty), will default to `[8, 6, 5, 4]`.
+        kernel_size (`int`, *optional*, defaults to 7):
+            Kernel size for the initial convolution.
+        last_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size for the last convolution layer.
+        residual_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size for the residual layers.
+        dilation_growth_rate (`int`, *optional*, defaults to 2):
+            How much to increase the dilation with each layer.
+        use_causal_conv (`bool`, *optional*, defaults to `True`):
+            Whether to use fully causal convolution.
+        pad_mode (`str`, *optional*, defaults to `"constant"`):
+            Padding mode for the convolutions.
+        compress (`int`, *optional*, defaults to 2):
+            Reduced dimensionality in residual branches.
+        trim_right_ratio (`float`, *optional*, defaults to 1.0):
+            Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
+            equal to 1.0, it means that all the trimming is done at the right.
+        codebook_size (`int`, *optional*, defaults to 2048):
+            Number of discrete codes in each codebook.
+        codebook_dim (`int`, *optional*, defaults to 256):
+            Dimension of the unquantized codebook vectors. If not defined, uses `hidden_size`.
+        num_quantizers (`int`, *optional*, defaults to 32):
+            Number of quantizer channels, or codebooks, in the quantizer.
+        use_conv_shortcut (`bool`, *optional*, defaults to `False`):
+            Whether to use a convolutional layer as the 'skip' connection in the `MimiResnetBlock` block. If False,
+            an identity function will be used, giving a generic residual connection.
+        vector_quantization_hidden_dimension (`int`, *optional*, defaults to 256):
+            Intermediate representation dimension in the residual vector quantization space.
+        num_semantic_quantizers (`int`, *optional*, defaults to 1):
+            Number of semantic quantizer channels, or codebooks, in the semantic quantizer. Must be lower than `num_quantizers`.
+        upsample_groups (`int`, *optional*, defaults to 512):
+            If `frame_rate!=encodec_frame_rate`, indicates the number of groups used in the upsampling operation to go from one rate to another.
+        num_hidden_layers (`int`, *optional*, defaults to 8):
+            Number of hidden layers in the Transformer models.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MLP representations.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8000):
+            The maximum sequence length that this model might ever be used with. Mimi's sliding window attention
+            allows sequence of up to 8000 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the LayerNorm normalization layers.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*, defaults to 250):
+            Sliding window attention window size. If not specified, will default to `250`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        layer_scale_initial_scale (`float`, *optional*, defaults to 0.01):
+            Initial scale of the residual rescaling operation done in the Transformer models.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+    Example:
+
+    ```python
+    >>> from transformers import MimiModel, MimiConfig
+
+    >>> # Initializing a "kyutai/mimi" style configuration
+    >>> configuration = MimiConfig()
+
+    >>> # Initializing a model (with random weights) from the "kyutai/mimi" style configuration
+    >>> model = MimiModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mimi"
+
+    def __init__(
+        self,
+        sampling_rate=24_000,
+        frame_rate=12.5,
+        audio_channels=1,
+        hidden_size=512,
+        num_filters=64,
+        num_residual_layers=1,
+        upsampling_ratios=None,
+        kernel_size=7,
+        last_kernel_size=3,
+        residual_kernel_size=3,
+        dilation_growth_rate=2,
+        use_causal_conv=True,
+        pad_mode="constant",
+        compress=2,
+        trim_right_ratio=1.0,
+        codebook_size=2048,
+        codebook_dim=256,
+        num_quantizers=32,
+        use_conv_shortcut=False,
+        vector_quantization_hidden_dimension=256,
+        num_semantic_quantizers=1,
+        upsample_groups=512,
+        num_hidden_layers=8,
+        intermediate_size=2048,
+        num_attention_heads=8,
+        num_key_value_heads=8,
+        head_dim=None,
+        hidden_act="gelu",
+        max_position_embeddings=8000,
+        initializer_range=0.02,
+        norm_eps=1e-5,
+        use_cache=False,
+        rope_theta=10000.0,
+        sliding_window=250,
+        attention_dropout=0.0,
+        layer_scale_initial_scale=0.01,
+        attention_bias=False,
+        **kwargs,
+    ):
+        self.sampling_rate = sampling_rate
+        self.frame_rate = frame_rate
+        self.audio_channels = audio_channels
+        self.hidden_size = hidden_size
+        self.num_filters = num_filters
+        self.num_residual_layers = num_residual_layers
+        self.upsampling_ratios = upsampling_ratios if upsampling_ratios else [8, 6, 5, 4]  # falsy -> default ratios
+        self.kernel_size = kernel_size
+        self.last_kernel_size = last_kernel_size
+        self.residual_kernel_size = residual_kernel_size
+        self.dilation_growth_rate = dilation_growth_rate
+        self.use_causal_conv = use_causal_conv
+        self.pad_mode = pad_mode
+        self.compress = compress
+        self.trim_right_ratio = trim_right_ratio
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
+        self.num_quantizers = num_quantizers
+        self.use_conv_shortcut = use_conv_shortcut
+        self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
+        self.upsample_groups = upsample_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.sliding_window = sliding_window
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim or hidden_size // num_attention_heads  # falsy head_dim (None or 0) -> derived value
+        self.layer_scale_initial_scale = layer_scale_initial_scale
+        self.attention_bias = attention_bias
+
+        if num_semantic_quantizers >= self.num_quantizers:  # must be strictly fewer than `num_quantizers`
+            raise ValueError(
+                f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {num_semantic_quantizers}."
+            )
+        self.num_semantic_quantizers = num_semantic_quantizers
+        super().__init__(**kwargs)
+
+    @property
+    def encodec_frame_rate(self) -> int:  # ceil(sampling_rate / product of `upsampling_ratios`)
+        hop_length = np.prod(self.upsampling_ratios)
+        return math.ceil(self.sampling_rate / hop_length)
+
+    @property
+    def num_codebooks(self) -> int:
+        # alias to num_quantizers
+        return self.num_quantizers
diff --git a/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
new file mode 100644
index 00000000000000..c617fa036c5d47
--- /dev/null
+++ b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Mimi checkpoints."""
+
+import argparse
+
+import safetensors
+import torch
+
+from transformers import (
+ EncodecFeatureExtractor,
+ MimiConfig,
+ MimiModel,
+ logging,
+)
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.mimi")
+
+
+def assert_param_count(model_1, model_2):  # compares parameter totals, ignoring names containing "final_proj"
+    count_1 = sum(tensor.numel() for name, tensor in model_1.named_parameters() if "final_proj" not in name)
+    count_2 = sum(tensor.numel() for name, tensor in model_2.named_parameters() if "final_proj" not in name)
+    assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
+
+
+def param_count(model):  # total number of elements over all parameters whose name lacks "final_proj"
+    return sum(tensor.numel() for name, tensor in model.named_parameters() if "final_proj" not in name)
+
+
+def _grab_best_device(use_gpu=True):
+    """Return the CUDA device when at least one is visible and `use_gpu` is truthy, else the CPU device."""
+    gpu_usable = torch.cuda.device_count() > 0 and use_gpu
+    device_name = "cuda" if gpu_usable else "cpu"
+    # Callers receive a `torch.device`, never the bare string.
+    return torch.device(device_name)
+
+
+convert_list = [
+    # GENERAL (pairs are applied in list order, so longer patterns come before their own sub-patterns)
+    ("conv.conv.conv", "conv"),
+    ("convtr.convtr.convtr", "conv"),
+    ("conv.conv", "conv"),
+    ("convtr.convtr", "conv"),
+    # QUANTIZER
+    ("quantizer.rvq_first.vq", "quantizer.semantic_residual_vector_quantizer"),
+    ("quantizer.rvq_first", "quantizer.semantic_residual_vector_quantizer"),
+    ("quantizer.rvq_rest.vq", "quantizer.acoustic_residual_vector_quantizer"),
+    ("quantizer.rvq_rest", "quantizer.acoustic_residual_vector_quantizer"),
+    ("_codebook", "codebook"),
+    ("_initialized", "initialized"),
+    ("embedding_sum", "embed_sum"),
+    # ENCODER / DECODER PART
+    ("encoder.model", "encoder.layers"),
+    ("decoder.model", "decoder.layers"),
+    # TRANSFORMERS PART
+    ("encoder_transformer.transformer", "encoder_transformer"),
+    ("decoder_transformer.transformer", "decoder_transformer"),
+    ("linear1", "mlp.fc1"),
+    ("linear2", "mlp.fc2"),
+    ("self_attn.out_proj", "self_attn.o_proj"),
+    ("norm1", "input_layernorm"),
+    ("norm2", "post_attention_layernorm"),
+    ("layer_scale_1", "self_attn_layer_scale"),
+    ("layer_scale_2", "mlp_layer_scale"),
+]
+
+
+def _convert_model(
+    state_dict,
+    hf_model,
+    convert_list,
+    device,
+    config,
+    unwanted_prefix=None,
+):
+    """Rename/split the keys of `state_dict` in place, load it strictly into `hf_model` and return the model."""
+    hidden_size = config.hidden_size
+    head_dim = config.head_dim
+    num_heads = int(config.hidden_size // config.head_dim)
+    num_key_value_heads = config.num_key_value_heads
+    key_value_head_dim = config.num_key_value_heads * head_dim
+
+    # permute for sliced rotary
+    def permute(w, n_heads, dim1=hidden_size, dim2=hidden_size):
+        return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+    for k in list(state_dict):  # snapshot of the keys: entries are popped and re-inserted while looping
+        new_k = k if unwanted_prefix is None else k[len(unwanted_prefix) :]
+        for old_layer_name, new_layer_name in convert_list:  # every matching pair is applied, in list order
+            if old_layer_name in new_k:
+                new_k = new_k.replace(old_layer_name, new_layer_name)
+
+        if "in_proj_weight" in new_k:
+            # split qkv into query key and value
+            mixed_qkv = state_dict.pop(k)
+            qkv_dim = mixed_qkv.size(0) // 3
+
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+
+            state_dict[new_k.replace("in_proj_weight", "q_proj.weight")] = permute(query_layer, num_heads)
+            state_dict[new_k.replace("in_proj_weight", "k_proj.weight")] = permute(
+                key_layer, num_key_value_heads, dim1=key_value_head_dim
+            )
+            state_dict[new_k.replace("in_proj_weight", "v_proj.weight")] = value_layer  # values are not permuted
+        else:
+            state_dict[new_k] = state_dict.pop(k)
+
+        # Fail early with the exact key sets rather than relying on the strict-loading error message.
+    extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
+    missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
+    if len(extra_keys) != 0:
+        raise ValueError(f"extra keys found: {extra_keys}")
+    if len(missing_keys) != 0:
+        raise ValueError(f"missing keys: {missing_keys}")
+    hf_model.load_state_dict(state_dict, strict=True)
+    n_params = param_count(hf_model)
+
+    logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
+
+    hf_model.eval()
+    hf_model.to(device)
+
+    return hf_model
+
+
+@torch.no_grad()
+def convert_checkpoint(
+    checkpoint_path,
+    pytorch_dump_folder_path,
+    config_path=None,
+    repo_id=None,
+):
+    """
+    Convert an original Mimi safetensors checkpoint to a `MimiModel`, save it, and optionally push it to the Hub.
+    """
+    device = _grab_best_device()
+
+    if config_path is not None:
+        config = MimiConfig.from_pretrained(config_path)
+    else:
+        config = MimiConfig()  # no config given: fall back to the default configuration
+
+    model = MimiModel(config)
+
+    feature_extractor = EncodecFeatureExtractor(
+        feature_size=config.audio_channels,
+        sampling_rate=config.sampling_rate,
+    )
+    feature_extractor.save_pretrained(pytorch_dump_folder_path)  # written before the weights are converted
+
+    original_checkpoint = safetensors.torch.load_file(checkpoint_path)  # NOTE(review): needs `safetensors.torch` loaded
+    if "best_state" in original_checkpoint:
+        # we might have a training state saved, in which case discard the yaml results and just retain the weights
+        original_checkpoint = original_checkpoint["best_state"]
+
+    model = _convert_model(original_checkpoint, model, convert_list, device, config)
+
+    model.save_pretrained(pytorch_dump_folder_path)
+
+    if repo_id:  # skipped when `repo_id` is None or empty
+        print("Pushing to the hub...")
+        feature_extractor.push_to_hub(repo_id)
+        model.push_to_hub(repo_id)
+
+
+if __name__ == "__main__":  # command-line entry point of the conversion script
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+    parser.add_argument(
+        "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+    )
+
+    args = parser.parse_args()
+    convert_checkpoint(
+        args.checkpoint_path,
+        args.pytorch_dump_folder_path,
+        args.config_path,
+        args.push_to_hub,  # forwarded positionally as `repo_id`
+    )
diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py
new file mode 100644
index 00000000000000..db36250b3d89df
--- /dev/null
+++ b/src/transformers/models/mimi/modeling_mimi.py
@@ -0,0 +1,1722 @@
+# coding=utf-8
+# Copyright 2024 Kyutai, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Mimi model."""
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_flash_attn_2_available,
+ is_flash_attn_greater_or_equal_2_10,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_mimi import MimiConfig
+
+
+if is_flash_attn_2_available():
+ from ...modeling_flash_attention_utils import _flash_attention_forward
+
+logger = logging.get_logger(__name__)
+
+
+# General docstring
+_CONFIG_FOR_DOC = "MimiConfig"
+
+
+# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
+def _prepare_4d_causal_attention_mask_with_cache_position(
+    attention_mask: torch.Tensor,
+    sequence_length: int,
+    target_length: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    min_dtype: float,
+    cache_position: torch.Tensor,
+    batch_size: int,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+        sequence_length (`int`):
+            The sequence length being processed.
+        target_length (`int`):
+            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+        dtype (`torch.dtype`):
+            The dtype to use for the 4D attention mask.
+        device (`torch.device`):
+            The device to place the 4D attention mask on.
+        min_dtype (`float`):
+            The minimum value representable with the dtype `dtype`.
+        cache_position (`torch.Tensor`):
+            Indices depicting the position of the input sequence tokens in the sequence.
+        batch_size (`int`):
+            Batch size.
+    """
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+        causal_mask = attention_mask
+    else:
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+
+    return causal_mask
+
+
+@dataclass
+class MimiOutput(ModelOutput):
+    """
+    Args:
+        audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discrete code embeddings computed using `model.encode`.
+        audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Decoded audio values, obtained using the decoder part of Mimi.
+        encoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+            have their past key value states given to this model).
+        decoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+            have their past key value states given to this model).
+    """
+
+    audio_codes: torch.LongTensor = None
+    audio_values: torch.FloatTensor = None
+    encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+    decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+@dataclass
+class MimiEncoderOutput(ModelOutput):
+    """
+    Args:
+        audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discrete code embeddings computed using `model.encode`.
+        encoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+            have their past key value states given to this model).
+    """
+
+    audio_codes: torch.LongTensor = None
+    encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+@dataclass
+class MimiDecoderOutput(ModelOutput):
+    """
+    Args:
+        audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
+            Decoded audio values, obtained using the decoder part of Mimi.
+        decoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+            have their past key value states given to this model).
+    """
+
+    audio_values: torch.FloatTensor = None
+    decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+class MimiConv1d(nn.Module):
+    """Conv1d wrapper that pads its input (causally or asymmetrically) before calling `nn.Conv1d`."""
+
+    def __init__(
+        self,
+        config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        pad_mode=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.pad_mode = config.pad_mode if pad_mode is None else pad_mode  # an explicit argument wins over the config
+
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            logger.warning(
+                "MimiConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride, dilation=dilation, groups=groups, bias=bias
+        )
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = torch.tensor(self.conv.stride[0], dtype=torch.int64)
+        dilation = self.conv.dilation[0]
+
+        # Effective kernel size with dilations.
+        kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
+
+        self.register_buffer("stride", stride, persistent=False)
+        self.register_buffer("kernel_size", kernel_size, persistent=False)
+        self.register_buffer("padding_total", kernel_size - stride, persistent=False)  # already an int64 tensor
+
+        # Asymmetric padding required for odd strides
+        self.padding_right = self.padding_total // 2
+        self.padding_left = self.padding_total - self.padding_right
+
+    def apply_weight_norm(self):  # prefers the `parametrizations` variant when this torch build has it
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv)
+
+    def remove_weight_norm(self):  # NOTE(review): confirm this also undoes the `parametrizations` variant
+        nn.utils.remove_weight_norm(self.conv)
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecConv1d._get_extra_padding_for_conv1d
+    def _get_extra_padding_for_conv1d(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """See `pad_for_conv1d`."""
+        length = hidden_states.shape[-1]
+        n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
+        n_frames = torch.ceil(n_frames).to(torch.int64) - 1
+        ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total
+
+        return ideal_length - length
+
+    @staticmethod
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecConv1d._pad1d
+    def _pad1d(hidden_states: torch.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
+        """Tiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
+        If this is the case, we insert extra 0 padding to the right before the reflection happens.
+        """
+        length = hidden_states.shape[-1]
+        padding_left, padding_right = paddings
+        if not mode == "reflect":
+            return nn.functional.pad(hidden_states, paddings, mode, value)
+
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
+        padded = nn.functional.pad(hidden_states, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+
+    def forward(self, hidden_states):
+        extra_padding = self._get_extra_padding_for_conv1d(hidden_states)
+
+        if self.causal:
+            # Left padding for causal
+            hidden_states = self._pad1d(hidden_states, (self.padding_total, extra_padding), mode=self.pad_mode)
+        else:
+            hidden_states = self._pad1d(
+                hidden_states, (self.padding_left, self.padding_right + extra_padding), mode=self.pad_mode
+            )
+
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class MimiConvTranspose1d(nn.Module):
+    """ConvTranspose1d wrapper that trims the fixed padding (causally or asymmetrically) from its output."""
+
+    def __init__(
+        self,
+        config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        groups: int = 1,
+        bias=True,
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.trim_right_ratio = config.trim_right_ratio
+        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride, groups=groups, bias=bias)
+
+        if not (self.causal or self.trim_right_ratio == 1.0):  # i.e. non-causal with a ratio other than 1.0
+            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = self.conv.stride[0]
+        padding_total = kernel_size - stride
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            self.padding_right = math.ceil(padding_total * self.trim_right_ratio)
+        else:
+            # Asymmetric padding required for odd strides
+            self.padding_right = padding_total // 2
+
+        self.padding_left = padding_total - self.padding_right  # whatever is not trimmed right is trimmed left
+
+    def apply_weight_norm(self):  # prefers the `parametrizations` variant when this torch build has it
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv)
+
+    def remove_weight_norm(self):  # NOTE(review): confirm this also undoes the `parametrizations` variant
+        nn.utils.remove_weight_norm(self.conv)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+
+        # unpad
+        end = hidden_states.shape[-1] - self.padding_right
+        hidden_states = hidden_states[..., self.padding_left : end]
+        return hidden_states
+
+
+# Copied from transformers.models.encodec.modeling_encodec.EncodecResnetBlock with Encodec->Mimi,EnCodec->Mimi
+class MimiResnetBlock(nn.Module):
+ """
+ Residual block from SEANet model as used by Mimi.
+
+ The block is a chain of (ELU, `MimiConv1d`) pairs, one pair per entry of `dilations`. Its output is added to a
+ shortcut of the input: a kernel-size-1 `MimiConv1d` when `config.use_conv_shortcut` is set, the identity
+ otherwise.
+
+ Raises:
+ ValueError: If `dilations` does not contain exactly as many entries as there are kernel sizes (two).
+ """
+
+ def __init__(self, config: MimiConfig, dim: int, dilations: List[int]):
+ super().__init__()
+ # Two convolutions per block: one with the configured kernel size, one with kernel size 1.
+ kernel_sizes = (config.residual_kernel_size, 1)
+ if len(kernel_sizes) != len(dilations):
+ raise ValueError("Number of kernel sizes should match number of dilations")
+
+ # Channel count used between the first and the last convolution.
+ hidden = dim // config.compress
+ block = []
+ for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+ # The first conv reads `dim` channels and the last one writes `dim` channels.
+ in_chs = dim if i == 0 else hidden
+ out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+ block += [nn.ELU()]
+ block += [MimiConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation)]
+ self.block = nn.ModuleList(block)
+
+ if config.use_conv_shortcut:
+ self.shortcut = MimiConv1d(config, dim, dim, kernel_size=1)
+ else:
+ self.shortcut = nn.Identity()
+
+ def forward(self, hidden_states):
+ """Apply the layers of `self.block` in order and add the shortcut of the block input."""
+ # Keep the block input for the shortcut branch.
+ residual = hidden_states
+ for layer in self.block:
+ hidden_states = layer(hidden_states)
+
+ return self.shortcut(residual) + hidden_states
+
+
+class MimiEncoder(nn.Module):
+ """SEANet encoder as used by Mimi.
+
+ Layer stack built in `__init__`: an input `MimiConv1d`; then, for every ratio of `config.upsampling_ratios`
+ visited in reverse order, `config.num_residual_layers` residual blocks followed by ELU and a `MimiConv1d` with
+ `stride=ratio` that doubles the channel count; finally ELU and a `MimiConv1d` projecting to
+ `config.hidden_size`.
+ """
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ model = [MimiConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
+ # Channel multiplier relative to `config.num_filters`; doubled after every stage below.
+ scaling = 1
+
+ # Downsample to raw audio scale
+ for ratio in reversed(config.upsampling_ratios):
+ current_scale = scaling * config.num_filters
+ # Add residual layers
+ for j in range(config.num_residual_layers):
+ # Dilation of the first conv grows geometrically with the residual layer index `j`.
+ model += [MimiResnetBlock(config, current_scale, [config.dilation_growth_rate**j, 1])]
+ # Add downsampling layers
+ model += [nn.ELU()]
+ model += [MimiConv1d(config, current_scale, current_scale * 2, kernel_size=ratio * 2, stride=ratio)]
+ scaling *= 2
+
+ model += [nn.ELU()]
+ model += [MimiConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size)]
+
+ self.layers = nn.ModuleList(model)
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecEncoder.forward
+ def forward(self, hidden_states):
+ """Feed `hidden_states` through every layer of `self.layers` in order and return the result."""
+ for layer in self.layers:
+ hidden_states = layer(hidden_states)
+ return hidden_states
+
+
+class MimiLayerScale(nn.Module):
+    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
+
+    Multiplies its input by a learnt vector of `config.hidden_size` entries, each initialised to
+    `config.layer_scale_initial_scale`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        initial_values = torch.full(
+            (config.hidden_size,),
+            config.layer_scale_initial_scale,
+            requires_grad=True,
+        )
+        self.scale = nn.Parameter(initial_values)
+
+    def forward(self, x: torch.Tensor):
+        """Return `x` scaled by `self.scale`, broadcast against the last dimension of `x`."""
+        scaled = self.scale * x
+        return scaled
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mimi
+class MimiRotaryEmbedding(nn.Module):
+ """Produces the cosine and sine tables of rotary position embeddings for given position ids."""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ # One inverse frequency per even index k in range(0, dim, 2): 1 / base ** (k / dim).
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+ # Registered as a non-persistent buffer.
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ @torch.no_grad()
+ # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward
+ # TODO(joao): add me back asap :)
+ def forward(self, x, position_ids):
+ """
+ Args:
+ x (`torch.Tensor`): Only its device type and dtype are read.
+ position_ids (`torch.Tensor`): Position indices of shape `(batch_size, seq_len)`.
+
+ Returns:
+ `tuple(torch.Tensor)`: `(cos, sin)`, each of shape `(batch_size, seq_len, 2 * len(inv_freq))` and cast to
+ `x.dtype`.
+ """
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 since bfloat16 loses precision on long contexts
+ # See https://github.com/huggingface/transformers/pull/29285
+ device_type = x.device.type
+ # Use "cpu" for the autocast context when the device type is "mps" or is not a string.
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ # (bs, n_freq, 1) @ (bs, 1, seq_len) -> (bs, n_freq, seq_len), transposed to (bs, seq_len, n_freq).
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ # The frequencies are repeated so that both halves of the last dimension share the same angles.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input.
+
+ The last dimension is split at `x.shape[-1] // 2` into `(x1, x2)` and `(-x2, x1)` is returned, concatenated
+ along that same dimension.
+ """
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`, *optional*):
+ Deprecated and unused.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ # Insert the axis along which `cos` / `sin` are broadcast against `q` and `k`.
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ # Same rotation for queries and keys: x * cos + rotate_half(x) * sin.
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class MimiMLP(nn.Module):
+ """Feed-forward block: bias-free linear `hidden_size -> intermediate_size`, the activation named by
+ `config.hidden_act`, then bias-free linear `intermediate_size -> hidden_size`."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ # Activation looked up by name in `ACT2FN`.
+ self.activation_fn = ACT2FN[config.hidden_act]
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP.forward
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ """Apply `fc1`, the activation and `fc2` in that order."""
+ hidden_states = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states = self.fc2(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+
+ When `n_rep == 1` the input tensor itself is returned.
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ # Add a repeat axis right after the head axis, expand it to `n_rep`, then fold it into the head axis.
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaAttention with Gemma->Mimi
+class MimiAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: MimiConfig, layer_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ # Index handed to `past_key_value.update` in `forward` to address this layer's cache slot.
+ self.layer_idx = layer_idx
+ if layer_idx is None:
+ logger.warning_once(
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+ "when creating this class."
+ )
+
+ self.attention_dropout = config.attention_dropout
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = config.head_dim
+ self.num_key_value_heads = config.num_key_value_heads
+ # Number of query heads that share one key/value head.
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.is_causal = True
+ # Factor applied to the raw attention scores in `forward`.
+ self.scaling = 1 / math.sqrt(config.head_dim)
+
+ if self.hidden_size % self.num_heads != 0:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+ self.rotary_emb = MimiRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ self.sliding_window = config.sliding_window # Ignore copy
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """
+ Eager (matmul + softmax) attention.
+
+ Args:
+ hidden_states (`torch.Tensor`): Input of shape `(batch_size, q_len, hidden_size)`.
+ attention_mask (`torch.Tensor`, *optional*):
+ 4D additive mask. It is sliced on its last axis to the key length and added to the attention scores.
+ position_ids (`torch.LongTensor`, *optional*): Position indices handed to the rotary embedding.
+ past_key_value (`Cache`, *optional*):
+ When given, its `update` method receives the new key/value states and returns the states to attend
+ over.
+ output_attentions (`bool`): Whether the attention weights are returned; `None` is returned otherwise.
+ use_cache (`bool`): Not read by this method.
+ cache_position (`torch.LongTensor`, *optional*): Forwarded to `past_key_value.update`.
+
+ Returns:
+ `(attn_output, attn_weights, past_key_value)` where `attn_output` has shape
+ `(batch_size, q_len, hidden_size)`.
+
+ Raises:
+ ValueError: If the attention output does not have shape `(batch_size, num_heads, q_len, head_dim)`.
+ """
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # Split the projections into heads: (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim).
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ # `value_states` only supplies the device type and dtype for the rotary tables.
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # Expand key/value heads so that every query head has a matching key/value head.
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+
+ if attention_mask is not None: # no matter the length, we just slice it
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, heads * head_dim).
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ attn_output = attn_output.view(bsz, q_len, -1)
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaFlashAttention2 with Gemma->Mimi
+class MimiFlashAttention2(MimiAttention):
+ """
+ Mimi flash attention module. This module inherits from `MimiAttention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """
+ Same signature as `MimiAttention.forward`, with the attention itself computed by `_flash_attention_forward`.
+
+ Attention weights are never returned: `output_attentions` is overridden to `False`, so the second element
+ of the returned tuple is always `None`.
+
+ Raises:
+ ValueError: If `past_key_value` is a `StaticCache`.
+ """
+ if isinstance(past_key_value, StaticCache):
+ raise ValueError(
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+ )
+
+ # The caller's value is deliberately ignored.
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # RoPE and the cache update below operate on [batch_size, num_heads, seq_len, head_dim]. The states are
+ # transposed back to [batch_size, seq_len, num_heads, head_dim] before the Flash Attention call
+ # further down.
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+ # to be able to avoid many of these transpose/reshape/view.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Dropout is only active in training mode.
+ dropout_rate = self.attention_dropout if self.training else 0.0
+
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
+ # cast them back to the correct dtype just to be sure everything works as expected.
+ # This might slow down training & inference so it is recommended to not cast the LayerNorms
+ # in fp32. (MimiRMSNorm handles it correctly)
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ # Target dtype, by priority: autocast dtype, pre-quantization dtype, dtype of the projection weights.
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ position_ids=position_ids,
+ dropout=dropout_rate,
+ sliding_window=getattr(self, "sliding_window", None),
+ is_causal=self.is_causal,
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ )
+
+ # Merge the head and head_dim axes before the output projection.
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ # Always taken, since `output_attentions` was forced to `False` above.
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->Mimi
+class MimiSdpaAttention(MimiAttention):
+ """
+ Mimi attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+ `MimiAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+ SDPA API.
+ """
+
+ # Adapted from MimiAttention.forward
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """
+ Same arguments as `MimiAttention.forward`; extra `**kwargs` are accepted and not used.
+
+ When `output_attentions` is truthy the call is delegated to `MimiAttention.forward` (after a one-time
+ warning). Otherwise the attention is computed by `scaled_dot_product_attention` and the returned attention
+ weights are `None`.
+ """
+ if output_attentions:
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "MimiModel is using MimiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # Split the projections into heads: (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim).
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ causal_mask = attention_mask
+ if attention_mask is not None:
+ # Keep only the mask columns that correspond to existing key positions.
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query_states.device.type == "cuda" and causal_mask is not None:
+ query_states = query_states.contiguous()
+ key_states = key_states.contiguous()
+ value_states = value_states.contiguous()
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ is_causal = True if causal_mask is None and q_len > 1 else False
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=causal_mask,
+ dropout_p=self.attention_dropout if self.training else 0.0,
+ is_causal=is_causal,
+ )
+
+ # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, heads * head_dim).
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.view(bsz, q_len, -1)
+
+ attn_output = self.o_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+# Attention class per implementation name; `MimiTransformerLayer` indexes it with `config._attn_implementation`.
+MIMI_ATTENTION_CLASSES = {
+ "eager": MimiAttention,
+ "flash_attention_2": MimiFlashAttention2,
+ "sdpa": MimiSdpaAttention,
+}
+
+
+class MimiTransformerLayer(nn.Module):
+    """Transformer layer made of a self-attention sub-block and an MLP sub-block.
+
+    Each sub-block normalises its input with a `nn.LayerNorm`, and its output is passed through a
+    `MimiLayerScale` before being added back to the un-normalised input.
+    """
+
+    def __init__(self, config: MimiConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        # The attention class is selected by the configured implementation name.
+        attention_cls = MIMI_ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
+
+        self.mlp = MimiMLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.self_attn_layer_scale = MimiLayerScale(config)
+        self.mlp_layer_scale = MimiLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Layer input of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`, *optional*):
+                Mask of size `(batch_size, sequence_length)` when flash attention is used, or
+                `(batch_size, 1, query_sequence_length, key_sequence_length)` for the default attention.
+            position_ids (`torch.LongTensor`, *optional*):
+                Passed through to the attention module.
+            past_key_value (`Cache`, *optional*):
+                Cached key and value projection states, passed through to the attention module.
+            output_attentions (`bool`, *optional*):
+                When truthy, the attention weights are appended to the returned tuple.
+            use_cache (`bool`, *optional*):
+                When truthy, the key/value cache returned by the attention module is appended to the returned
+                tuple.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments, forwarded untouched to the attention module (used by FSDP and other
+                code-injecting methods).
+
+        Returns:
+            `tuple`: `(hidden_states,)`, followed by the attention weights if `output_attentions` and by the
+            key/value cache if `use_cache`, in that order.
+        """
+        # Self-attention sub-block: normalise, attend, scale, then add the un-normalised input back.
+        attn_output, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self.self_attn_layer_scale(attn_output)
+
+        # MLP sub-block, same residual pattern.
+        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + self.mlp_layer_scale(mlp_output)
+
+        optional_outputs = []
+        if output_attentions:
+            optional_outputs.append(self_attn_weights)
+        if use_cache:
+            optional_outputs.append(present_key_value)
+
+        return (hidden_states, *optional_outputs)
+
+
+class MimiTransformerModel(nn.Module):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MimiTransformerLayer`]
+
+ Args:
+ config: MimiConfig
+ """
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+
+ self.layers = nn.ModuleList(
+ [MimiTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self._attn_implementation = config._attn_implementation
+
+ self.gradient_checkpointing = False
+ self.config = config
+
+ def forward(
+ self,
+ hidden_states: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Embedded representation that will be contextualized by the model
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ When provided, the mask handed to the layers is the one returned by `_update_causal_mask`. When omitted,
+ the layers receive no mask.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`. Defaults to `cache_position.unsqueeze(0)`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+ Pre-computed key and value states of the self-attention blocks that can be used to speed up sequential
+ decoding. This typically consists in the `past_key_values` returned by the model at a previous stage of
+ decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ If `use_cache` is enabled, no `past_key_values` are passed and the module is not in training mode, a
+ `DynamicCache` is created through `DynamicCache.from_legacy_cache`.
+
+ NOTE(review): this method calls `past_key_values.get_seq_length()` directly and does not convert a legacy
+ tuple, so only [`~cache_utils.Cache`] instances appear to be supported here - confirm before relying on
+ the legacy tuple format.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ cache_position (`torch.LongTensor`, *optional*):
+ Positions of the input tokens in the sequence. Defaults to
+ `arange(past_seen_tokens, past_seen_tokens + sequence_length)`, where `past_seen_tokens` is the length
+ reported by `past_key_values` (0 without a cache).
+
+ Returns:
+ A `BaseModelOutputWithPast`, or, when `return_dict` is falsy, a tuple of the non-`None` values among
+ `(hidden_states, next_cache, all_hidden_states, all_self_attns)`.
+ """
+ # Every flag left to `None` falls back to the value stored in the config.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.gradient_checkpointing and self.training and use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+ )
+ use_cache = False
+
+ if use_cache and past_key_values is None and not self.training:
+ # `past_key_values` is None here, so this presumably yields an empty `DynamicCache`.
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position = torch.arange(
+ past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=hidden_states.device
+ )
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ # A mask is only built when the caller supplied one.
+ causal_mask = None
+ if attention_mask is not None:
+ causal_mask = self._update_causal_mask(
+ attention_mask, hidden_states, cache_position, past_key_values, output_attentions
+ )
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = None
+
+ for decoder_layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ # Arguments are passed positionally, in the order of `MimiTransformerLayer.forward`'s parameters.
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ causal_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ use_cache,
+ cache_position,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=causal_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ # The cache follows the attention weights in the layer output when those are returned.
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+ # Copied from transformers.models.gemma.modeling_gemma.GemmaModel._update_causal_mask
+ def _update_causal_mask(
+ self,
+ attention_mask: torch.Tensor,
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor,
+ past_key_values: Cache,
+ output_attentions: bool,
+ ):
+ """
+ Build the mask handed to the attention layers for the configured attention implementation.
+
+ - `flash_attention_2`: returns `attention_mask` unchanged if it contains a zero, `None` otherwise.
+ - `sdpa` without a static cache and without `output_attentions`: returns `None` when
+ `AttentionMaskConverter._ignore_causal_mask_sdpa` reports that the mask can be dropped.
+ - otherwise: returns the 4D mask produced by `_prepare_4d_causal_attention_mask_with_cache_position`.
+ """
+ if self.config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and 0.0 in attention_mask:
+ return attention_mask
+ return None
+
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+ # to infer the attention mask.
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ using_static_cache = isinstance(past_key_values, StaticCache)
+
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
+ attention_mask,
+ inputs_embeds=input_tensor,
+ past_key_values_length=past_seen_tokens,
+ is_training=self.training,
+ ):
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ # Most negative finite value of the dtype, passed on as the fill value for the mask.
+ min_dtype = torch.finfo(dtype).min
+ sequence_length = input_tensor.shape[1]
+ # Key length of the mask: the static cache's maximum length, else the width of the provided mask.
+ if using_static_cache:
+ target_length = past_key_values.get_max_length()
+ else:
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, torch.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ device=device,
+ min_dtype=min_dtype,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+ if (
+ self.config._attn_implementation == "sdpa"
+ and attention_mask is not None
+ and attention_mask.device.type == "cuda"
+ and not output_attentions
+ ):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+ return causal_mask
+
+
+class MimiDecoder(nn.Module):
+ """SEANet decoder as used by Mimi."""
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ scaling = int(2 ** len(config.upsampling_ratios))
+ model = [MimiConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]
+
+ # Upsample to raw audio scale
+ for ratio in config.upsampling_ratios:
+ current_scale = scaling * config.num_filters
+ # Add upsampling layers
+ model += [nn.ELU()]
+ model += [
+ MimiConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
+ ]
+ # Add residual layers
+ for j in range(config.num_residual_layers):
+ model += [MimiResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
+ scaling //= 2
+
+ # Add final layers
+ model += [nn.ELU()]
+ model += [MimiConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
+ self.layers = nn.ModuleList(model)
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecDecoder.forward
+ def forward(self, hidden_states):
+ for layer in self.layers:
+ hidden_states = layer(hidden_states)
+ return hidden_states
+
+
+class MimiEuclideanCodebook(nn.Module):
+ """Codebook with Euclidean distance."""
+
+ def __init__(self, config: MimiConfig, epsilon: float = 1e-5):
+ super().__init__()
+ embed = torch.zeros(config.codebook_size, config.codebook_dim)
+
+ self.codebook_size = config.codebook_size
+
+ self.register_buffer("initialized", torch.Tensor([True]))
+ self.register_buffer("cluster_usage", torch.ones(config.codebook_size))
+ self.register_buffer("embed_sum", embed)
+ self._embed = None
+ self.epsilon = epsilon
+
+ @property
+ def embed(self) -> torch.Tensor:
+ if self._embed is None:
+ self._embed = self.embed_sum / self.cluster_usage.clamp(min=self.epsilon)[:, None]
+ return self._embed
+
+ def quantize(self, hidden_states):
+ # Projects each vector in `hidden_states` onto the nearest centroid and returns its index.
+ # `hidden_states` should be `[N, D]` with `N` the number of input vectors and `D` the dimension.
+ dists = torch.cdist(hidden_states[None], self.embed[None], p=2)[0]
+ embed_ind = dists.argmin(dim=-1)
+ return embed_ind
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.encode
+ def encode(self, hidden_states):
+ shape = hidden_states.shape
+ # pre-process
+ hidden_states = hidden_states.reshape((-1, shape[-1]))
+ # quantize
+ embed_ind = self.quantize(hidden_states)
+ # post-process
+ embed_ind = embed_ind.view(*shape[:-1])
+ return embed_ind
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.decode
+ def decode(self, embed_ind):
+ quantize = nn.functional.embedding(embed_ind, self.embed)
+ return quantize
+
+
+# Copied from transformers.models.encodec.modeling_encodec.EncodecVectorQuantization with Encodec->Mimi
+class MimiVectorQuantization(nn.Module):
+ """
+ Vector quantization implementation. Currently supports only euclidean distance.
+ """
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ self.codebook = MimiEuclideanCodebook(config)
+
+ def encode(self, hidden_states):
+ hidden_states = hidden_states.permute(0, 2, 1)
+ embed_in = self.codebook.encode(hidden_states)
+ return embed_in
+
+ def decode(self, embed_ind):
+ quantize = self.codebook.decode(embed_ind)
+ quantize = quantize.permute(0, 2, 1)
+ return quantize
+
+
+class MimiResidualVectorQuantizer(nn.Module):
+ """Residual Vector Quantizer."""
+
+ def __init__(self, config: MimiConfig, num_quantizers: int = None):
+ super().__init__()
+ self.codebook_size = config.codebook_size
+ self.frame_rate = config.frame_rate
+ self.num_quantizers = num_quantizers if num_quantizers is not None else config.num_quantizers
+ self.layers = nn.ModuleList([MimiVectorQuantization(config) for _ in range(self.num_quantizers)])
+
+ self.input_proj = None
+ self.output_proj = None
+ if config.vector_quantization_hidden_dimension != config.hidden_size:
+ self.input_proj = torch.nn.Conv1d(
+ config.hidden_size, config.vector_quantization_hidden_dimension, 1, bias=False
+ )
+ self.output_proj = torch.nn.Conv1d(
+ config.vector_quantization_hidden_dimension, config.hidden_size, 1, bias=False
+ )
+
+ def encode(self, embeddings: torch.Tensor, num_quantizers: Optional[int] = None) -> torch.Tensor:
+ """
+ Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
+ the appropriate number of quantizers to use and returns indices for each quantizer.
+ """
+ if self.input_proj is not None:
+ embeddings = self.input_proj(embeddings)
+
+ num_quantizers = num_quantizers if num_quantizers is not None else self.num_quantizers
+
+ residual = embeddings
+ all_indices = []
+ for layer in self.layers[:num_quantizers]:
+ indices = layer.encode(residual)
+ quantized = layer.decode(indices)
+ residual = residual - quantized
+ all_indices.append(indices)
+ out_indices = torch.stack(all_indices)
+ return out_indices
+
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
+ """Decode the given codes of shape [B, K, T] to the quantized representation."""
+ quantized_out = torch.tensor(0.0, device=codes.device)
+ codes = codes.transpose(0, 1)
+ for i, indices in enumerate(codes):
+ layer = self.layers[i]
+ quantized = layer.decode(indices)
+ quantized_out = quantized_out + quantized
+
+ if self.output_proj is not None:
+ quantized_out = self.output_proj(quantized_out)
+ return quantized_out
+
+
+class MimiSplitResidualVectorQuantizer(nn.Module):
+ """Split Residual Vector Quantizer."""
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ self.codebook_size = config.codebook_size
+ self.frame_rate = config.frame_rate
+ self.max_num_quantizers = config.num_quantizers
+
+ self.num_semantic_quantizers = config.num_semantic_quantizers
+ self.num_acoustic_quantizers = config.num_quantizers - config.num_semantic_quantizers
+
+ self.semantic_residual_vector_quantizer = MimiResidualVectorQuantizer(config, self.num_semantic_quantizers)
+ self.acoustic_residual_vector_quantizer = MimiResidualVectorQuantizer(config, self.num_acoustic_quantizers)
+
+ def encode(self, embeddings: torch.Tensor, num_quantizers: Optional[float] = None) -> torch.Tensor:
+ """
+ Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
+ the appropriate number of quantizers to use and returns indices for each quantizer.
+ """
+
+ num_quantizers = self.max_num_quantizers if num_quantizers is None else num_quantizers
+
+ if num_quantizers > self.max_num_quantizers:
+ raise ValueError(
+ f"The number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers {self.max_num_quantizers}, but is currently {num_quantizers}."
+ )
+
+ if num_quantizers < self.num_semantic_quantizers:
+ raise ValueError(
+ f"The number of quantizers (i.e codebooks) asked should be higher than the number of semantic quantizers {self.num_semantic_quantizers}, but is currently {num_quantizers}."
+ )
+
+ # codes is [K, B, T], with T frames, K nb of codebooks.
+ codes = self.semantic_residual_vector_quantizer.encode(embeddings)
+
+ if num_quantizers > self.num_semantic_quantizers:
+ acoustic_codes = self.acoustic_residual_vector_quantizer.encode(
+ embeddings, num_quantizers=num_quantizers - self.num_semantic_quantizers
+ )
+ codes = torch.cat([codes, acoustic_codes], dim=0)
+
+ return codes
+
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
+ """Decode the given codes to the quantized representation."""
+
+ # The first num_semantic_quantizers codebooks are decoded using the semantic RVQ
+ quantized_out = self.semantic_residual_vector_quantizer.decode(codes[:, : self.num_semantic_quantizers])
+
+ # The rest of the codebooks are decoded using the acoustic RVQ
+ if codes.shape[1] > self.num_semantic_quantizers:
+ quantized_out += self.acoustic_residual_vector_quantizer.decode(codes[:, self.num_semantic_quantizers :])
+ return quantized_out
+
+
+class MimiPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = MimiConfig
+ base_model_prefix = "mimi"
+ main_input_name = "input_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["MimiDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ _supports_cache_class = True
+ _supports_static_cache = True
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecPreTrainedModel._init_weights
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, nn.Conv1d):
+ nn.init.kaiming_normal_(module.weight)
+ if module.bias is not None:
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+ nn.init.uniform_(module.bias, a=-k, b=k)
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LSTM):
+ for name, param in module.named_parameters():
+ if "weight" in name:
+ nn.init.xavier_uniform_(param)
+ elif "bias" in name:
+ nn.init.constant_(param, 0.0)
+
+
+MIMI_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`MimiConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+MIMI_INPUTS_DOCSTRING = r"""
+ Args:
+ input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
+ Raw audio input converted to Float.
+ padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*.
+ num_quantizers (`int`, *optional*):
+ Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+ Discrete code embeddings computed using `model.encode`.
+ encoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ decoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The Mimi neural audio codec model.",
+ MIMI_START_DOCSTRING,
+)
+class MimiModel(MimiPreTrainedModel):
+ def __init__(self, config: MimiConfig):
+ super().__init__(config)
+ self.config = config
+
+ self.encoder = MimiEncoder(config)
+ self.encoder_transformer = MimiTransformerModel(config)
+
+ self.downsample = None
+ self.upsample = None
+ if config.frame_rate != config.encodec_frame_rate:
+ self.downsample = MimiConv1d(
+ config,
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=2 * int(config.encodec_frame_rate / config.frame_rate),
+ stride=2,
+ bias=False,
+ pad_mode="replicate",
+ )
+
+ self.upsample = MimiConvTranspose1d(
+ config,
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=2 * int(config.encodec_frame_rate / config.frame_rate),
+ stride=2,
+ bias=False,
+ groups=config.upsample_groups,
+ )
+
+ self.decoder_transformer = MimiTransformerModel(config)
+ self.decoder = MimiDecoder(config)
+
+ self.quantizer = MimiSplitResidualVectorQuantizer(config)
+
+ self.bits_per_codebook = int(math.log2(self.config.codebook_size))
+ if 2**self.bits_per_codebook != self.config.codebook_size:
+ raise ValueError("The codebook_size must be a power of 2.")
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def _encode_frame(
+ self,
+ input_values: torch.Tensor,
+ num_quantizers: int,
+ padding_mask: int,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Encodes the given input using the underlying VQVAE. The padding mask is required to compute the correct scale.
+ """
+ embeddings = self.encoder(input_values)
+ encoder_outputs = self.encoder_transformer(
+ embeddings.transpose(1, 2), past_key_values=past_key_values, return_dict=return_dict
+ )
+ if return_dict:
+ past_key_values = encoder_outputs.get("past_key_values")
+ elif len(encoder_outputs) > 1:
+ past_key_values = encoder_outputs[1]
+ embeddings = encoder_outputs[0].transpose(1, 2)
+ embeddings = self.downsample(embeddings)
+
+ codes = self.quantizer.encode(embeddings, num_quantizers)
+ codes = codes.transpose(0, 1)
+ return codes, past_key_values
+
+ def encode(
+ self,
+ input_values: torch.Tensor,
+ padding_mask: torch.Tensor = None,
+ num_quantizers: Optional[float] = None,
+ encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, Optional[torch.Tensor]], MimiEncoderOutput]:
+ """
+ Encodes the input audio waveform into discrete codes.
+
+ Args:
+ input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+ Float values of the input audio waveform.
+ padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*.
+ num_quantizers (`int`, *optional*):
+ Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
+ encoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ Returns:
+ `codebook` of shape `[batch_size, num_codebooks, frames]`, the discrete encoded codes for the input audio waveform.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ num_quantizers = self.config.num_quantizers if num_quantizers is None else num_quantizers
+
+ if num_quantizers > self.config.num_quantizers:
+ raise ValueError(
+ f"The number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers {self.config.num_quantizers}, but is currently {num_quantizers}."
+ )
+
+ _, channels, input_length = input_values.shape
+
+ if channels < 1 or channels > 2:
+ raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
+
+ if padding_mask is None:
+ padding_mask = torch.ones_like(input_values).bool()
+
+ encoded_frames, encoder_past_key_values = self._encode_frame(
+ input_values,
+ num_quantizers,
+ padding_mask.bool(),
+ past_key_values=encoder_past_key_values,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ return (
+ encoded_frames,
+ encoder_past_key_values,
+ )
+
+ return MimiEncoderOutput(encoded_frames, encoder_past_key_values)
+
+ def _decode_frame(
+ self,
+ codes: torch.Tensor,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.Tensor:
+ embeddings = self.quantizer.decode(codes)
+
+ embeddings = self.upsample(embeddings)
+ decoder_outputs = self.decoder_transformer(
+ embeddings.transpose(1, 2), past_key_values=past_key_values, return_dict=return_dict
+ )
+ if return_dict:
+ past_key_values = decoder_outputs.get("past_key_values")
+ elif len(decoder_outputs) > 1:
+ past_key_values = decoder_outputs[1]
+ embeddings = decoder_outputs[0].transpose(1, 2)
+ outputs = self.decoder(embeddings)
+ return outputs, past_key_values
+
+ def decode(
+ self,
+ audio_codes: torch.Tensor,
+ padding_mask: Optional[torch.Tensor] = None,
+ decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], MimiDecoderOutput]:
+ """
+ Decodes the given frames into an output audio waveform.
+
+ Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+ trimmed.
+
+ Args:
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+ Discrete code embeddings computed using `model.encode`.
+ padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*.
+ decoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ """
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ audio_values, decoder_past_key_values = self._decode_frame(
+ audio_codes, past_key_values=decoder_past_key_values, return_dict=return_dict
+ )
+
+ # truncate based on padding mask
+ if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
+ audio_values = audio_values[..., : padding_mask.shape[-1]]
+
+ if not return_dict:
+ return (
+ audio_values,
+ decoder_past_key_values,
+ )
+ return MimiDecoderOutput(audio_values, decoder_past_key_values)
+
+ @add_start_docstrings_to_model_forward(MIMI_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=MimiOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_values: torch.Tensor,
+ padding_mask: Optional[torch.Tensor] = None,
+ num_quantizers: Optional[int] = None,
+ audio_codes: Optional[torch.Tensor] = None,
+ encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], MimiOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from datasets import load_dataset
+ >>> from transformers import AutoFeatureExtractor, MimiModel
+
+ >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+ >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+ >>> model_id = "kyutai/mimi"
+ >>> model = MimiModel.from_pretrained(model_id)
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
+
+ >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> audio_codes = outputs.audio_codes
+ >>> audio_values = outputs.audio_values
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ if padding_mask is None:
+ padding_mask = torch.ones_like(input_values).bool()
+
+ if audio_codes is None:
+ encoder_outputs = self.encode(
+ input_values, padding_mask, num_quantizers, encoder_past_key_values, return_dict=return_dict
+ )
+ audio_codes = encoder_outputs[0]
+ if return_dict:
+ encoder_past_key_values = encoder_outputs.get("past_key_values")
+ elif len(encoder_outputs) > 1:
+ encoder_past_key_values = encoder_outputs[1]
+
+ decoder_outputs = self.decode(audio_codes, padding_mask, decoder_past_key_values, return_dict=return_dict)
+ audio_values = decoder_outputs[0]
+ if return_dict:
+ decoder_past_key_values = decoder_outputs.get("past_key_values")
+ elif len(decoder_outputs) > 1:
+ decoder_past_key_values = decoder_outputs[1]
+
+ if not return_dict:
+ return (audio_codes, audio_values, encoder_past_key_values, decoder_past_key_values)
+
+ return MimiOutput(
+ audio_codes=audio_codes,
+ audio_values=audio_values,
+ encoder_past_key_values=encoder_past_key_values,
+ decoder_past_key_values=decoder_past_key_values,
+ )
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 2db7b38b580375..5f8ae6b5fbffac 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -5840,6 +5840,20 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class MimiModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MimiPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class MistralForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/tests/models/mimi/__init__.py b/tests/models/mimi/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py
new file mode 100644
index 00000000000000..dd0f77421be728
--- /dev/null
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -0,0 +1,890 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Mimi model."""
+
+import inspect
+import os
+import tempfile
+import unittest
+
+import numpy as np
+from datasets import Audio, load_dataset
+from packaging import version
+from parameterized import parameterized
+from pytest import mark
+
+from transformers import AutoFeatureExtractor, MimiConfig
+from transformers.testing_utils import (
+ is_flaky,
+ is_torch_available,
+ require_flash_attn,
+ require_torch,
+ require_torch_gpu,
+ require_torch_sdpa,
+ slow,
+ torch_device,
+)
+from transformers.utils import (
+ is_torch_bf16_available_on_device,
+ is_torch_fp16_available_on_device,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+ import torch
+
+ from transformers import MimiModel
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.prepare_inputs_dict
+def prepare_inputs_dict(
+ config,
+ input_ids=None,
+ input_values=None,
+ decoder_input_ids=None,
+ attention_mask=None,
+ decoder_attention_mask=None,
+ head_mask=None,
+ decoder_head_mask=None,
+ cross_attn_head_mask=None,
+):
+ if input_ids is not None:
+ encoder_dict = {"input_ids": input_ids}
+ else:
+ encoder_dict = {"input_values": input_values}
+
+ decoder_dict = {"decoder_input_ids": decoder_input_ids} if decoder_input_ids is not None else {}
+
+ return {**encoder_dict, **decoder_dict}
+
+
+@require_torch
+class MimiModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=5,
+ num_channels=1,
+ is_training=False,
+ intermediate_size=40,
+ hidden_size=32,
+ num_filters=8,
+ num_residual_layers=1,
+ upsampling_ratios=[8, 4],
+ codebook_size=64,
+ vector_quantization_hidden_dimension=64,
+ codebook_dim=64,
+ upsample_groups=32,
+ num_hidden_layers=2,
+ num_attention_heads=2,
+ num_key_value_heads=2,
+ sliding_window=4,
+ use_cache=False,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.num_channels = num_channels
+ self.is_training = is_training
+ self.intermediate_size = intermediate_size
+ self.hidden_size = hidden_size
+ self.num_filters = num_filters
+ self.num_residual_layers = num_residual_layers
+ self.upsampling_ratios = upsampling_ratios
+ self.codebook_size = codebook_size
+ self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
+ self.codebook_dim = codebook_dim
+ self.upsample_groups = upsample_groups
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_key_value_heads = num_key_value_heads
+ self.sliding_window = sliding_window
+ self.use_cache = use_cache
+
+ def prepare_config_and_inputs(self):
+ input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0)
+ config = self.get_config()
+ inputs_dict = {"input_values": input_values}
+ return config, inputs_dict
+
+ def prepare_config_and_inputs_for_common(self):
+ config, inputs_dict = self.prepare_config_and_inputs()
+ return config, inputs_dict
+
+ def prepare_config_and_inputs_for_model_class(self, model_class):
+ config, inputs_dict = self.prepare_config_and_inputs()
+ inputs_dict["audio_codes"] = ids_tensor([self.batch_size, 1, self.num_channels], self.codebook_size).type(
+ torch.int32
+ )
+
+ return config, inputs_dict
+
+ def get_config(self):
+ return MimiConfig(
+ audio_channels=self.num_channels,
+ chunk_in_sec=None,
+ hidden_size=self.hidden_size,
+ num_filters=self.num_filters,
+ num_residual_layers=self.num_residual_layers,
+ upsampling_ratios=self.upsampling_ratios,
+ codebook_size=self.codebook_size,
+ vector_quantization_hidden_dimension=self.vector_quantization_hidden_dimension,
+ upsample_groups=self.upsample_groups,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ num_key_value_heads=self.num_key_value_heads,
+ sliding_window=self.sliding_window,
+ codebook_dim=self.codebook_dim,
+ use_cache=self.use_cache,
+ )
+
+ def create_and_check_model_forward(self, config, inputs_dict):
+ model = MimiModel(config=config).to(torch_device).eval()
+
+ input_values = inputs_dict["input_values"]
+ result = model(input_values)
+ self.parent.assertEqual(
+ result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size)
+ )
+
+
+@require_torch
+class MimiModelTest(ModelTesterMixin, unittest.TestCase):
+ all_model_classes = (MimiModel,) if is_torch_available() else ()
+ is_encoder_decoder = True
+ test_pruning = False
+ test_headmasking = False
+ test_resize_embeddings = False
+ test_torchscript = False
+ input_name = "input_values"
+
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+ # model does not support returning attentions or hidden states
+ inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+ if "output_attentions" in inputs_dict:
+ inputs_dict.pop("output_attentions")
+ if "output_hidden_states" in inputs_dict:
+ inputs_dict.pop("output_hidden_states")
+ return inputs_dict
+
+ def setUp(self):
+ self.model_tester = MimiModelTester(self)
+ self.config_tester = ConfigTester(
+ self, config_class=MimiConfig, hidden_size=37, common_properties=[], has_text_modality=False
+ )
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ def test_model_forward(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model_forward(*config_and_inputs)
+
+ def test_forward_signature(self):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ signature = inspect.signature(model.forward)
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
+ arg_names = [*signature.parameters.keys()]
+
+ expected_arg_names = ["input_values", "padding_mask", "num_quantizers"]
+ self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+ @unittest.skip(reason="The MimiModel does not have `inputs_embeds` logics")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have `inputs_embeds` logics")
+ def test_model_get_set_embeddings(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+ def test_retain_grad_hidden_states_attentions(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+ def test_torchscript_output_attentions(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `hidden_states` logic")
+ def test_torchscript_output_hidden_state(self):
+ pass
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest._create_and_check_torchscript
+ def _create_and_check_torchscript(self, config, inputs_dict):
+ if not self.test_torchscript:
+ self.skipTest(reason="test_torchscript is set to False")
+
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
+ configs_no_init.torchscript = True
+ configs_no_init.return_dict = False
+ for model_class in self.all_model_classes:
+ model = model_class(config=configs_no_init)
+ model.to(torch_device)
+ model.eval()
+ inputs = self._prepare_for_class(inputs_dict, model_class)
+
+ main_input_name = model_class.main_input_name
+
+ try:
+ main_input = inputs[main_input_name]
+ model(main_input)
+ traced_model = torch.jit.trace(model, main_input)
+ except RuntimeError:
+ self.fail("Couldn't trace module.")
+
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
+ pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+
+ try:
+ torch.jit.save(traced_model, pt_file_name)
+ except Exception:
+ self.fail("Couldn't save module.")
+
+ try:
+ loaded_model = torch.jit.load(pt_file_name)
+ except Exception:
+ self.fail("Couldn't load module.")
+
+ model.to(torch_device)
+ model.eval()
+
+ loaded_model.to(torch_device)
+ loaded_model.eval()
+
+ model_state_dict = model.state_dict()
+ loaded_model_state_dict = loaded_model.state_dict()
+
+ non_persistent_buffers = {}
+ for key in loaded_model_state_dict.keys():
+ if key not in model_state_dict.keys():
+ non_persistent_buffers[key] = loaded_model_state_dict[key]
+
+ loaded_model_state_dict = {
+ key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
+ }
+
+ self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
+
+ model_buffers = list(model.buffers())
+ for non_persistent_buffer in non_persistent_buffers.values():
+ found_buffer = False
+ for i, model_buffer in enumerate(model_buffers):
+ if torch.equal(non_persistent_buffer, model_buffer):
+ found_buffer = True
+ break
+
+ self.assertTrue(found_buffer)
+ model_buffers.pop(i)
+
+ model_buffers = list(model.buffers())
+ for non_persistent_buffer in non_persistent_buffers.values():
+ found_buffer = False
+ for i, model_buffer in enumerate(model_buffers):
+ if torch.equal(non_persistent_buffer, model_buffer):
+ found_buffer = True
+ break
+
+ self.assertTrue(found_buffer)
+ model_buffers.pop(i)
+
+ models_equal = True
+ for layer_name, p1 in model_state_dict.items():
+ if layer_name in loaded_model_state_dict:
+ p2 = loaded_model_state_dict[layer_name]
+ if p1.data.ne(p2.data).sum() > 0:
+ models_equal = False
+
+ self.assertTrue(models_equal)
+
+ # Avoid memory leak. Without this, each call increases RAM usage by ~20MB.
+ # (Even with this call, there is still a memory leak of ~0.04MB)
+ self.clear_torch_jit_class_registry()
+
+ @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+ def test_attention_outputs(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `hidden_states` logic")
+ def test_hidden_states_output(self):
+ pass
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_determinism
+ def test_determinism(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ def check_determinism(first, second):
+ # outputs are not tensors but lists (since each sequence doesn't have the same frame_length)
+ out_1 = first.cpu().numpy()
+ out_2 = second.cpu().numpy()
+ out_1 = out_1[~np.isnan(out_1)]
+ out_2 = out_2[~np.isnan(out_2)]
+ max_diff = np.amax(np.abs(out_1 - out_2))
+ self.assertLessEqual(max_diff, 1e-5)
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ model.to(torch_device)
+ model.eval()
+ with torch.no_grad():
+ first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+ second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+
+ if isinstance(first, tuple) and isinstance(second, tuple):
+ for tensor1, tensor2 in zip(first, second):
+ check_determinism(tensor1, tensor2)
+ else:
+ check_determinism(first, second)
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_model_outputs_equivalence
+ def test_model_outputs_equivalence(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ def set_nan_tensor_to_zero(t):
+ t[t != t] = 0
+ return t
+
+ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+ with torch.no_grad():
+ tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
+ dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs)
+
+ self.assertTrue(isinstance(tuple_output, tuple))
+ self.assertTrue(isinstance(dict_output, dict))
+
+ for tuple_value, dict_value in zip(tuple_output, dict_output.values()):
+ self.assertTrue(
+ torch.allclose(
+ set_nan_tensor_to_zero(tuple_value), set_nan_tensor_to_zero(dict_value), atol=1e-5
+ ),
+ msg=(
+ "Tuple and dict output are not equal. Difference:"
+ f" {torch.max(torch.abs(tuple_value - dict_value))}. Tuple has `nan`:"
+ f" {torch.isnan(tuple_value).any()} and `inf`: {torch.isinf(tuple_value)}. Dict has"
+ f" `nan`: {torch.isnan(dict_value).any()} and `inf`: {torch.isinf(dict_value)}."
+ ),
+ )
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ model.to(torch_device)
+ model.eval()
+
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+ check_equivalence(model, tuple_inputs, dict_inputs)
+
+ def test_initialization(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ configs_no_init = _config_zero_init(config)
+ for model_class in self.all_model_classes:
+ model = model_class(config=configs_no_init)
+ for name, param in model.named_parameters():
+ uniform_init_parms = ["conv", "input_proj", "output_proj"]
+ if param.requires_grad:
+ if any(x in name for x in uniform_init_parms):
+ self.assertTrue(
+ -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+ )
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_identity_shortcut
+ def test_identity_shortcut(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+ config.use_conv_shortcut = False
+ self.model_tester.create_and_check_model_forward(config, inputs_dict)
+
+ @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
+ @require_torch_sdpa
+ @slow
+ def test_eager_matches_sdpa_inference(self, torch_dtype: str):
+ if not self.has_attentions:
+ self.skipTest(reason="Model architecture does not support attentions")
+
+ if not self.all_model_classes[0]._supports_sdpa:
+ self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+ if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device):
+ self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)")
+
+ if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device):
+ self.skipTest(
+ f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)"
+ )
+
+ # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead.
+ if torch_dtype == "float16":
+ torch_dtype = torch.float16
+ elif torch_dtype == "bfloat16":
+ torch_dtype = torch.bfloat16
+ elif torch_dtype == "float32":
+ torch_dtype = torch.float32
+
+ atols = {
+ ("cpu", False, torch.float32): 1e-6,
+ ("cpu", False, torch.bfloat16): 1e-2,
+ ("cpu", True, torch.float32): 1e-6,
+ ("cpu", True, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float32): 1e-6,
+ ("cuda", False, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float16): 5e-3,
+ ("cuda", True, torch.float32): 1e-6,
+ ("cuda", True, torch.bfloat16): 1e-2,
+ ("cuda", True, torch.float16): 5e-3,
+ }
+ rtols = {
+ ("cpu", False, torch.float32): 1e-4,
+ ("cpu", False, torch.bfloat16): 1e-2,
+ ("cpu", True, torch.float32): 1e-4,
+ ("cpu", True, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float32): 1e-4,
+ ("cuda", False, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float16): 5e-3,
+ ("cuda", True, torch.float32): 1e-4,
+ ("cuda", True, torch.bfloat16): 3e-2,
+ ("cuda", True, torch.float16): 5e-3,
+ }
+
+ def get_mean_reldiff(failcase, x, ref, atol, rtol):
+ return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
+
+ for model_class in self.all_model_classes:
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ model = model_class(config)
+ # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors.
+ # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask.
+ # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code.
+ # However, masking there is not done at any layer that matters (i.e. self-attention), therefore we can safely deactivate it.
+ deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters
+
+ is_encoder_decoder = model.config.is_encoder_decoder
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname)
+ model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
+ model_sdpa = model_sdpa.eval().to(torch_device)
+
+ self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+ model_eager = model_class.from_pretrained(
+ tmpdirname,
+ torch_dtype=torch_dtype,
+ attn_implementation="eager",
+ )
+ model_eager = model_eager.eval().to(torch_device)
+
+ self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+ for name, submodule in model_eager.named_modules():
+ class_name = submodule.__class__.__name__
+ if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+ raise ValueError("The eager model should not have SDPA attention layers")
+
+ has_sdpa = False
+ for name, submodule in model_sdpa.named_modules():
+ class_name = submodule.__class__.__name__
+ if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+ has_sdpa = True
+ break
+ if not has_sdpa and model_sdpa.config.model_type != "falcon":
+ raise ValueError("The SDPA model should have SDPA attention layers")
+
+ # We use these for loops instead of parameterized.expand just to avoid loading/saving the model 16 times,
+ # but it would be nicer to have an efficient way to use parameterized.expand
+ fail_cases = []
+ for padding_side in ["left", "right"]:
+ for use_mask in [False, True]:
+ for output_attentions in [True, False]:
+ can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters
+ if not (self.has_attentions and can_output_attn) and output_attentions:
+ continue
+ for batch_size in [1, 5]:
+ dummy_input = inputs_dict[model.main_input_name]
+
+ if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+ dummy_input = dummy_input.to(torch_dtype)
+
+ dummy_input = dummy_input[:batch_size]
+ if dummy_input.shape[0] != batch_size:
+ if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+ extension = torch.rand(
+ batch_size - dummy_input.shape[0],
+ *dummy_input.shape[1:],
+ dtype=torch_dtype,
+ device=torch_device,
+ )
+ dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+ else:
+ extension = torch.randint(
+ high=5,
+ size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]),
+ dtype=dummy_input.dtype,
+ device=torch_device,
+ )
+ dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+
+ if not use_mask:
+ dummy_attention_mask = None
+ else:
+ dummy_attention_mask = inputs_dict.get("attention_mask", None)
+ if dummy_attention_mask is None:
+ if is_encoder_decoder:
+ seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1]
+ else:
+ seqlen = dummy_input.shape[-1]
+ dummy_attention_mask = (
+ torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device)
+ )
+
+ dummy_attention_mask = dummy_attention_mask[:batch_size]
+ if dummy_attention_mask.shape[0] != batch_size:
+ extension = torch.ones(
+ batch_size - dummy_attention_mask.shape[0],
+ *dummy_attention_mask.shape[1:],
+ dtype=dummy_attention_mask.dtype,
+ device=torch_device,
+ )
+ dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0)
+ dummy_attention_mask = dummy_attention_mask.to(torch_device)
+
+ dummy_attention_mask[:] = 1
+ if padding_side == "left":
+ dummy_attention_mask[-1, :-1] = 1
+ dummy_attention_mask[-1, -4:] = 0
+ elif padding_side == "right":
+ dummy_attention_mask[-1, 1:] = 1
+ dummy_attention_mask[-1, :3] = 0
+
+ for enable_kernels in [False, True]:
+ failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
+ if is_encoder_decoder:
+ decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[
+ :batch_size
+ ]
+ if decoder_input_ids.shape[0] != batch_size:
+ extension = torch.ones(
+ batch_size - decoder_input_ids.shape[0],
+ *decoder_input_ids.shape[1:],
+ dtype=decoder_input_ids.dtype,
+ device=torch_device,
+ )
+ decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0)
+ decoder_input_ids = decoder_input_ids.to(torch_device)
+
+ # TODO: never an `attention_mask` arg here?
+ processed_inputs = {
+ model.main_input_name: dummy_input,
+ "decoder_input_ids": decoder_input_ids,
+ "decoder_attention_mask": dummy_attention_mask,
+ "output_hidden_states": True,
+ }
+ else:
+ processed_inputs = {
+ model.main_input_name: dummy_input,
+ "output_hidden_states": True,
+ }
+
+ # Otherwise fails for e.g. WhisperEncoderModel
+ if "attention_mask" in inspect.signature(model_eager.forward).parameters:
+ processed_inputs["attention_mask"] = dummy_attention_mask
+
+ if (
+ self.has_attentions
+ and "output_attentions" in inspect.signature(model_sdpa.forward).parameters
+ ):
+ processed_inputs["output_attentions"] = output_attentions
+ if not deactivate_mask and (
+ "bool_masked_pos" in inspect.signature(model_eager.forward).parameters
+ ):
+ dummy_mask = torch.ones((self.model_tester.num_masks,))
+
+ # In case of additional token (like class) we define a custom `mask_length`
+ if hasattr(self.model_tester, "mask_length"):
+ mask_length = self.model_tester.mask_length - dummy_mask.size(0)
+ else:
+ mask_length = self.model_tester.seq_length - dummy_mask.size(0)
+ dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)])
+ dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool()
+ processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device)
+
+ if "noise" in inspect.signature(model_eager.forward).parameters:
+ np.random.seed(2)
+ num_patches = int(
+ (self.model_tester.image_size // self.model_tester.patch_size) ** 2
+ )
+ noise = np.random.uniform(size=(batch_size, num_patches))
+ processed_inputs["noise"] = torch.from_numpy(noise)
+
+ # TODO: test gradients as well (& for FA2 as well!)
+ with torch.no_grad():
+ with torch.backends.cuda.sdp_kernel(
+ enable_flash=enable_kernels,
+ enable_math=True,
+ enable_mem_efficient=enable_kernels,
+ ):
+ prepared_inputs = self._prepare_for_class(processed_inputs, model_class)
+ outputs_eager = model_eager(**prepared_inputs)
+ outputs_sdpa = model_sdpa(**prepared_inputs)
+
+ # Ignore copy
+ logits_eager = outputs_eager.audio_values
+ # Ignore copy
+ logits_sdpa = outputs_sdpa.audio_values
+
+ if torch_device in ["cpu", "cuda"]:
+ atol = atols[torch_device, enable_kernels, torch_dtype]
+ rtol = rtols[torch_device, enable_kernels, torch_dtype]
+ else:
+ atol = 1e-7
+ rtol = 1e-4
+
+ # Masked tokens output slightly deviates - we don't mind that.
+ if use_mask:
+ if padding_side == "left":
+ sub_sdpa = logits_sdpa[:-1]
+ sub_eager = logits_eager[:-1]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ sub_sdpa = logits_sdpa[-1, :-4]
+ sub_eager = logits_eager[-1, :-4]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ # Testing the padding tokens is not really meaningful but anyway
+ # sub_sdpa = logits_sdpa[-1, -4:]
+ # sub_eager = logits_eager[-1, -4:]
+ # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+ elif padding_side == "right":
+ sub_sdpa = logits_sdpa[:-1]
+ sub_eager = logits_eager[:-1]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ sub_sdpa = logits_sdpa[-1, 3:]
+ sub_eager = logits_eager[-1, 3:]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ # Testing the padding tokens is not really meaningful but anyway
+ # sub_sdpa = logits_sdpa[-1, :3]
+ # sub_eager = logits_eager[-1, :3]
+ # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+
+ else:
+ if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
+ )
+
+ self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
+
+ @require_flash_attn
+ @require_torch_gpu
+ @mark.flash_attn_test
+ @slow
+ @is_flaky()
+ def test_flash_attn_2_inference_equivalence(self):
+ for model_class in self.all_model_classes:
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ model = model_class(config)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname)
+ model_fa = model_class.from_pretrained(
+ tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+ )
+ model_fa.to(torch_device)
+
+ model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
+ model.to(torch_device)
+
+ dummy_input = inputs_dict[model.main_input_name][:1]
+ if dummy_input.dtype in [torch.float32, torch.float16]:
+ dummy_input = dummy_input.to(torch.bfloat16)
+
+ outputs = model(dummy_input)
+ outputs_fa = model_fa(dummy_input)
+
+ logits = outputs[1]
+ logits_fa = outputs_fa[1]
+
+ assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)
+
+ @unittest.skip(reason="The MimiModel does not support right padding")
+ def test_flash_attn_2_inference_equivalence_right_padding(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have support dynamic compile yet")
+ def test_sdpa_can_compile_dynamic(self):
+ pass
+
+ # For now, let's focus only on GPU for `torch.compile`
+ @slow
+ @require_torch_gpu
+ def test_torch_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.3"):
+ self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ n_iter = 3
+ for model_class in self.all_model_classes:
+ model = model_class(config).to(torch_device)
+ model.forward = torch.compile(model.forward)
+ for i in range(n_iter):
+ _ = model(inputs_dict["input_values"].to(torch_device))
+
+ @is_flaky()
+ def test_batching_equivalence(self):
+ super().test_batching_equivalence()
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.normalize
+def normalize(arr):
+ norm = np.linalg.norm(arr)
+ normalized_arr = arr / norm
+ return normalized_arr
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.compute_rmse
+def compute_rmse(arr1, arr2):
+ arr1_normalized = normalize(arr1)
+ arr2_normalized = normalize(arr2)
+ return np.sqrt(((arr1_normalized - arr2_normalized) ** 2).mean())
+
+
+@slow
+@require_torch
+class MimiIntegrationTest(unittest.TestCase):
+ def test_integration_using_cache_decode(self):
+ expected_rmse = {
+ "8": 0.0018785292,
+ "32": 0.0012330565,
+ }
+
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ model_id = "kyutai/mimi"
+
+ model = MimiModel.from_pretrained(model_id, use_cache=True).to(torch_device)
+ processor = AutoFeatureExtractor.from_pretrained(model_id)
+
+ librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+ audio_sample = librispeech_dummy[-1]["audio"]["array"]
+
+ inputs = processor(
+ raw_audio=audio_sample,
+ sampling_rate=processor.sampling_rate,
+ return_tensors="pt",
+ ).to(torch_device)
+
+ for num_codebooks, expected_rmse in expected_rmse.items():
+ with torch.no_grad():
+ # use max bandwidth for best possible reconstruction
+ encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
+
+ audio_codes = encoder_outputs[0]
+
+ decoder_outputs_first_part = model.decode(audio_codes[:, :, : audio_codes.shape[2] // 2])
+ decoder_outputs_second_part = model.decode(
+ audio_codes[:, :, audio_codes.shape[2] // 2 :],
+ decoder_past_key_values=decoder_outputs_first_part.decoder_past_key_values,
+ )
+
+ audio_output_entire_context = model.decode(audio_codes)[0]
+ audio_output_concat_context = torch.cat(
+ [decoder_outputs_first_part[0], decoder_outputs_second_part[0]], dim=2
+ )
+
+ # make sure audios are more or less equal
+ # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0
+ rmse = compute_rmse(
+ audio_output_concat_context.squeeze().cpu().numpy(),
+ audio_output_entire_context.squeeze().cpu().numpy(),
+ )
+ self.assertTrue(rmse < 1e-3)
+
+ def test_integration(self):
+ expected_rmses = {
+ "8": 0.0018785292,
+ "32": 0.0012330565,
+ }
+ expected_codesums = {
+ "8": 430423,
+ "32": 1803071,
+ }
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ model_id = "kyutai/mimi"
+
+ processor = AutoFeatureExtractor.from_pretrained(model_id)
+
+ librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+ audio_sample = librispeech_dummy[-1]["audio"]["array"]
+
+ inputs = processor(
+ raw_audio=audio_sample,
+ sampling_rate=processor.sampling_rate,
+ return_tensors="pt",
+ ).to(torch_device)
+
+ for use_cache in [False, True]:
+ model = MimiModel.from_pretrained(model_id, use_cache=use_cache).to(torch_device)
+ for num_codebooks, expected_rmse in expected_rmses.items():
+ with torch.no_grad():
+ # use max bandwidth for best possible reconstruction
+ encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
+
+ audio_code_sums = encoder_outputs[0].sum().cpu().item()
+
+ # make sure audio encoded codes are correct
+ # assert relative difference less than a threshold, because `audio_code_sums` varies a bit
+ # depending on torch version
+ self.assertTrue(
+ np.abs(audio_code_sums - expected_codesums[num_codebooks]) <= (3e-3 * audio_code_sums)
+ )
+
+ input_values_dec = model.decode(encoder_outputs[0], padding_mask=inputs["padding_mask"])[0]
+ input_values_enc_dec = model(
+ inputs["input_values"], inputs["padding_mask"], num_quantizers=int(num_codebooks)
+ )[1]
+
+ # make sure forward and decode give the same result
+ self.assertTrue(torch.allclose(input_values_dec, input_values_enc_dec))
+
+ # make sure shape matches
+ self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape)
+
+ arr = inputs["input_values"][0].cpu().numpy()
+ arr_enc_dec = input_values_enc_dec[0].cpu().numpy()
+
+ # make sure audios are more or less equal
+ # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0
+ rmse = compute_rmse(arr, arr_enc_dec)
+ self.assertTrue(np.abs(rmse - expected_rmse) < 1e-5)
From e40bb4845e0eefb52ec1e9cac9c2446ab36aef81 Mon Sep 17 00:00:00 2001
From: Raushan Turganbay
Date: Thu, 19 Sep 2024 09:56:52 +0200
Subject: [PATCH 38/50] Load and save video-processor from separate folder
(#33562)
* load and save from video-processor folder
* Update src/transformers/models/llava_onevision/processing_llava_onevision.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
.../image_processing_llava_onevision.py | 1 +
.../processing_llava_onevision.py | 53 ++++++++++++++++++-
.../test_processing_llava_onevision.py | 21 ++++----
tests/test_processing_common.py | 8 +++
4 files changed, 71 insertions(+), 12 deletions(-)
diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
index 3dddcdd148a416..2047557208372a 100644
--- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
@@ -621,6 +621,7 @@ def preprocess(
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
+ size = get_size_dict(size, default_to_square=False)
image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index e050ec3f31deea..d4ae02e0bb154c 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -17,6 +17,7 @@
"""
import math
+import os
import sys
from typing import Iterable, List, Union
@@ -34,6 +35,11 @@
ProcessorMixin,
)
from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ..auto import AutoImageProcessor
+
+
+logger = logging.get_logger(__name__)
class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False):
@@ -96,7 +102,7 @@ def __init__(
chat_template=None,
image_token="",
video_token="