[WIP] Moshi integration #33624

Draft · wants to merge 9 commits into base: main · changes from all commits
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -726,6 +726,8 @@
title: Mimi
- local: model_doc/mms
title: MMS
- local: model_doc/moshi
title: Moshi
- local: model_doc/musicgen
title: MusicGen
- local: model_doc/musicgen_melody
3 changes: 3 additions & 0 deletions docs/source/en/model_doc/mimi.md
@@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.

## Overview


The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team that combines semantic and acoustic information into audio tokens running at 12 Hz and a bitrate of 1.1 kbps. In other words, it can be used to map audio waveforms into discrete "audio tokens" drawn from learned "codebooks".
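The ~1.1 kbps figure quoted above can be sanity-checked with a few lines of arithmetic. The codebook layout used below (a 12.5 Hz frame rate, 8 codebooks per frame, 2048 entries per codebook) is an assumption taken from the Moshi paper, not something stated in this diff:

```python
import math

# Hypothetical codec layout (from the Moshi paper, not this PR):
frame_rate_hz = 12.5   # token frames per second
num_codebooks = 8      # parallel codebooks per frame
codebook_size = 2048   # entries per codebook -> 11 bits per index

bits_per_index = math.log2(codebook_size)            # 11.0 bits
bitrate_bps = frame_rate_hz * num_codebooks * bits_per_index

print(bitrate_bps)  # 1100.0 bits/s, i.e. the ~1.1 kbps quoted in the overview
```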

The abstract from the paper is the following:
@@ -29,6 +30,7 @@ Its architecture is based on [Encodec](model_doc/encodec) with several major differences:
* it uses additional transformers for encoding and decoding for better latent contextualization
* it uses a different quantization scheme: one codebook is dedicated to semantic projection.


## Usage example

Here is a quick example of how to encode and decode audio using this model:
@@ -54,6 +56,7 @@
```

This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).

The original code can be found [here](https://github.com/kyutai-labs/moshi).


53 changes: 53 additions & 0 deletions docs/source/en/model_doc/moshi.md
@@ -0,0 +1,53 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Moshi

## Overview

The Moshi model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>

The abstract from the paper is the following:

*<INSERT PAPER ABSTRACT HERE>*

Tips:

<INSERT TIPS ABOUT MODEL HERE>

This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).


## MoshiConfig

[[autodoc]] MoshiConfig

## MoshiModel

[[autodoc]] MoshiModel
- forward

## MoshiForCausalLM

[[autodoc]] MoshiForCausalLM
- forward

## MoshiForConditionalGeneration

[[autodoc]] MoshiForConditionalGeneration
- forward
23 changes: 22 additions & 1 deletion src/transformers/__init__.py
@@ -594,7 +594,9 @@
"models.mt5": ["MT5Config"],
"models.musicgen": [
"MusicgenConfig",
"MusicgenDecoderConfig",
],
"models.moshi": [
"MoshiConfig",
],
"models.musicgen_melody": [
"MusicgenMelodyConfig",
@@ -2790,6 +2792,15 @@
"MusicgenProcessor",
]
)
_import_structure["models.moshi"].extend(
[
"MoshiForCausalLM",
"MoshiForConditionalGeneration",
"MoshiModel",
"MoshiPreTrainedModel",
"MoshiProcessor",
]
)
_import_structure["models.musicgen_melody"].extend(
[
"MusicgenMelodyForCausalLM",
@@ -5384,6 +5395,9 @@
MusicgenConfig,
MusicgenDecoderConfig,
)
from .models.moshi import (
MoshiConfig,
)
from .models.musicgen_melody import (
MusicgenMelodyConfig,
MusicgenMelodyDecoderConfig,
@@ -7319,6 +7333,13 @@
MusicgenPreTrainedModel,
MusicgenProcessor,
)
from .models.moshi import (
MoshiForCausalLM,
MoshiForConditionalGeneration,
MoshiModel,
MoshiPreTrainedModel,
MoshiProcessor,
)
from .models.musicgen_melody import (
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
1 change: 1 addition & 0 deletions src/transformers/configuration_utils.py
@@ -1033,6 +1033,7 @@ def _get_non_default_generation_parameters(self) -> Dict[str, Any]:
if decoder_config is not self:
default_config = decoder_config.__class__()
else:
default_config = None
decoder_config = None

# If it is a composite model, we want to check the subconfig that will be used for generation
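The one-line `default_config = None` addition in `configuration_utils.py` guards against a later `UnboundLocalError`: a name assigned in only one branch but read unconditionally afterwards blows up at the read. A minimal sketch of the pattern, with hypothetical names rather than the real transformers internals:

```python
# Hypothetical stand-in for the branch structure in
# _get_non_default_generation_parameters: `default_config` must exist on
# every path before it is inspected further down.
def resolve_default_config(decoder_config, decoder_is_self):
    if not decoder_is_self:
        default_config = decoder_config.__class__()
    else:
        default_config = None  # the fix: the name now exists on this path too
        decoder_config = None

    # Without the fix, this read raises UnboundLocalError whenever
    # decoder_is_self is True.
    if default_config is not None:
        return default_config
    return decoder_config

print(resolve_default_config({"a": 1}, False))  # {} (a fresh default dict)
print(resolve_default_config({"a": 1}, True))   # None
```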
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -163,6 +163,7 @@
mra,
mt5,
musicgen,
moshi,
musicgen_melody,
mvp,
nemotron,
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -180,6 +180,7 @@
("mra", "MraConfig"),
("mt5", "MT5Config"),
("musicgen", "MusicgenConfig"),
("moshi", "MoshiConfig"),
("musicgen_melody", "MusicgenMelodyConfig"),
("mvp", "MvpConfig"),
("nat", "NatConfig"),
@@ -484,6 +485,7 @@
("mra", "MRA"),
("mt5", "MT5"),
("musicgen", "MusicGen"),
("moshi", "Moshi"),
("musicgen_melody", "MusicGen Melody"),
("mvp", "MVP"),
("nat", "NAT"),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -171,6 +171,7 @@
("mra", "MraModel"),
("mt5", "MT5Model"),
("musicgen", "MusicgenModel"),
("moshi", "MoshiModel"),
("musicgen_melody", "MusicgenMelodyModel"),
("mvp", "MvpModel"),
("nat", "NatModel"),
@@ -498,6 +499,7 @@
("mixtral", "MixtralForCausalLM"),
("mpt", "MptForCausalLM"),
("musicgen", "MusicgenForCausalLM"),
("moshi", "MoshiForCausalLM"),
("musicgen_melody", "MusicgenMelodyForCausalLM"),
("mvp", "MvpForCausalLM"),
("nemotron", "NemotronForCausalLM"),
@@ -1261,6 +1263,7 @@
("bark", "BarkModel"),
("fastspeech2_conformer", "FastSpeech2ConformerWithHifiGan"),
("musicgen", "MusicgenForConditionalGeneration"),
("moshi", "MoshiForConditionalGeneration"),
("musicgen_melody", "MusicgenMelodyForConditionalGeneration"),
("seamless_m4t", "SeamlessM4TForTextToSpeech"),
("seamless_m4t_v2", "SeamlessM4Tv2ForTextToSpeech"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -318,6 +318,7 @@
),
),
("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("moshi", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
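Each entry in the tokenizer mapping above pairs a model type with a `(slow_tokenizer, fast_tokenizer_or_None)` tuple, where the fast slot is only filled when the `tokenizers` backend is installed. A small stand-alone sketch of that shape (the function and dict below are stand-ins, not the real transformers registry):

```python
# Hypothetical mirror of how TOKENIZER_MAPPING_NAMES entries behave: the
# second slot of each tuple degrades to None when the fast backend is absent.
def build_tokenizer_mapping(tokenizers_available: bool) -> dict:
    fast = "T5TokenizerFast" if tokenizers_available else None
    return {
        "musicgen": ("T5Tokenizer", fast),
        "moshi": ("T5Tokenizer", fast),
        "musicgen_melody": ("T5Tokenizer", fast),
    }

print(build_tokenizer_mapping(False)["moshi"])  # ('T5Tokenizer', None)
print(build_tokenizer_mapping(True)["moshi"])   # ('T5Tokenizer', 'T5TokenizerFast')
```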
2 changes: 2 additions & 0 deletions src/transformers/models/mimi/configuration_mimi.py
@@ -30,6 +30,7 @@ class MimiConfig(PretrainedConfig):
This is the configuration class to store the configuration of an [`MimiModel`]. It is used to instantiate a
Mimi model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the
[kyutai/mimi](https://huggingface.co/kyutai/mimi) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
@@ -126,6 +127,7 @@ class MimiConfig(PretrainedConfig):
```python
>>> from transformers import MimiModel, MimiConfig
>>> # Initializing a "kyutai/mimi" style configuration
>>> configuration = MimiConfig()
2 changes: 2 additions & 0 deletions src/transformers/models/mimi/modeling_mimi.py
@@ -1000,6 +1000,7 @@ def forward(
)
use_cache = False


if use_cache and not isinstance(past_key_values, Cache):
if past_key_values is None:
past_key_values = DynamicCache()
@@ -1687,6 +1688,7 @@ def forward(
>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]
>>> model_id = "kyutai/mimi"
>>> model = MimiModel.from_pretrained(model_id)
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
59 changes: 59 additions & 0 deletions src/transformers/models/moshi/__init__.py
@@ -0,0 +1,59 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available


_import_structure = {
"configuration_moshi": [
"MoshiConfig",
],
}

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_moshi"] = [
"MoshiForConditionalGeneration",
"MoshiForCausalLM",
"MoshiModel",
"MoshiPreTrainedModel",
]

if TYPE_CHECKING:
from .configuration_moshi import (
MoshiConfig,
)

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_moshi import (
MoshiForCausalLM,
MoshiForConditionalGeneration,
MoshiModel,
MoshiPreTrainedModel,
)

else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
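The `sys.modules` swap above installs a `_LazyModule` so that the torch-backed Moshi classes are only imported on first attribute access, keeping `import transformers` cheap. A toy sketch of that mechanism (a stand-in class loading stdlib modules rather than model files, not the real `_LazyModule`):

```python
import importlib
import types

# Minimal lazy-module sketch: attribute access triggers the real import,
# and the result is cached so the import runs only once.
class LazyModule(types.ModuleType):
    def __init__(self, name: str, import_structure: dict):
        super().__init__(name)
        # Invert {submodule: [attrs]} into {attr: submodule}.
        self._attr_to_module = {
            attr: module
            for module, attrs in import_structure.items()
            for attr in attrs
        }

    def __getattr__(self, attr):
        if attr not in self._attr_to_module:
            raise AttributeError(attr)
        module = importlib.import_module(self._attr_to_module[attr])
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache: later lookups skip __getattr__
        return value

lazy = LazyModule("demo", {"json": ["dumps"], "math": ["sqrt"]})
print(lazy.sqrt(9.0))  # 3.0 — `math` was imported only at this access
```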