Add support of audio transformers #1317

Open · wants to merge 6 commits into base: main
200 changes: 200 additions & 0 deletions docs/cookbook/audio_understanding.md
@@ -0,0 +1,200 @@
# Generate structured output for audio understanding

Even though audio LMs for audio-text-to-text tasks are still fairly niche, they are useful (and fun) for analysing speech, extracting information, translating, or transcribing it.

This cookbook highlights the new audio-LM integration; it has been tested with `Qwen/Qwen2-Audio-7B-Instruct` ([HF link](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)).

## Setup

As usual, let's install the required packages:

```bash
pip install outlines torch==2.4.0 transformers accelerate librosa
```

You can then import everything as follows:

```python
# LLM stuff
import outlines
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# Audio stuff
import librosa
from io import BytesIO
from urllib.request import urlopen

# Data-model stuff
from enum import Enum
from pydantic import BaseModel
from typing import Optional
```

## Load the model and processor

To perform audio analysis we need a model and its processor to pre-process the prompts and the audio. Let's proceed as follows:

```python
qwen2_audio = outlines.models.transformers_vision(
    "Qwen/Qwen2-Audio-7B-Instruct",
    model_class=Qwen2AudioForConditionalGeneration,
    model_kwargs={
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
    },
    processor_kwargs={
        "device": "cuda",  # set to "cpu" if you don't have a GPU
    },
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
```
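
As a small optional check (not part of the original recipe), the processor exposes the sampling rate that the audio must be resampled to; the extractor defined below relies on it:

```python
# The feature extractor's sampling rate (16 kHz for Qwen2-Audio) is what audio gets resampled to.
print(processor.feature_extractor.sampling_rate)
```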

Let's also define a useful audio extractor from conversational prompts:

```python
def audio_extractor(conversation):
    audios = []
    for message in conversation:
        if isinstance(message["content"], list):
            for elt in message["content"]:
                if elt["type"] == "audio":
                    # librosa.load returns (waveform, sampling_rate); we keep only the waveform.
                    audios.append(
                        librosa.load(
                            BytesIO(urlopen(elt["audio_url"]).read()),
                            sr=processor.feature_extractor.sampling_rate,
                        )[0]
                    )
    return audios
```
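
As a quick sanity check (again, not part of the recipe itself), calling the extractor on a one-audio conversation should return a single NumPy waveform resampled to the processor's sampling rate:

```python
# Hypothetical check: one np.ndarray waveform per audio entry in the conversation.
sample_conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
    ]},
]
waveforms = audio_extractor(sample_conversation)
print(len(waveforms), waveforms[0].shape)  # 1 (n_samples,)
```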

## Question answering

Let's say we want to analyse this [audio](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav) and answer the question the speaker asks in it.

### Data structure

To get structured output, we can define the following data model:

```python
class Age(int, Enum):
    twenties = 20
    fifties = 50

class Gender(str, Enum):
    male = "male"
    female = "female"

class Person(BaseModel):
    gender: Gender
    age: Age
    language: Optional[str]
```
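
If you are curious about what will actually be interpolated into the prompt below, you can print the schema Pydantic generates (the exact output depends on your Pydantic version):

```python
import json

# The JSON schema that will be embedded in the prompt.
print(json.dumps(Person.model_json_schema(), indent=2))
```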

### Prompting

Let's use the following prompt to ask our model:

```python
audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": audio_url},
        {
            "type": "text",
            "text": f"""As asked in the audio, what is the gender and the age of the speaker?

Return the information in the following JSON schema:
{Person.model_json_schema()}
""",
        },
    ]},
]
```

But we cannot pass this to the model as is: we need to pre-process it and extract the audio.

```python
audios = audio_extractor(conversation)

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
```

Now we're ready to ask our model!

### Run the model

As usual with the Outlines framework, we instantiate a generator that structures the output according to our data model:

```python
person_generator = outlines.generate.json(
    qwen2_audio,
    Person,
    sampler=outlines.samplers.greedy()
)
```

Running it is as simple as:

```python
result = person_generator(prompt, audios)
```

You should get a result like the following:
```
Person(
    gender=<Gender.female: 'female'>,
    age=<Age.twenties: 20>,
    language='English'
)
```
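
Since `result` is a plain Pydantic model, you can work with its fields directly (a small usage sketch, assuming the output above):

```python
# Fields are regular enum / str values on a validated Person instance.
assert result.gender == Gender.female
print(result.age.value)  # 20
print(result.language)   # English
```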

## Classification

Now we can focus on this [audio](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3) of a glass breaking.

The audio transformers integration lets you use all the functionality of the Outlines API, such as the `choice` method. We can proceed as follows:

### Prompting

Let's consider the following prompt and pre-process our audio:

```python
audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": audio_url},
        {
            "type": "text",
            "text": "Do you hear a dog barking or a glass breaking?"
        },
    ]},
]

audios = audio_extractor(conversation)

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
```

### Run the model

As mentioned, we will use the `choice` method to generate our structured output:

```python
choice_generator = outlines.generate.choice(
    qwen2_audio,
    ["dog barking", "glass breaking"],
)

result = choice_generator(prompt, audios)
```

You should get:
```python
print(result)
# "glass breaking"
```
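
The audio integration is also registered with `outlines.generate.regex` (and `fsm`/`cfg`), so you could constrain the answer with a regular expression instead of a fixed choice list. Here is a minimal sketch of that idea, reusing the prompt and audio from above:

```python
# A sketch: constrain the answer with a regex instead of `choice`.
regex_generator = outlines.generate.regex(
    qwen2_audio,
    r"(dog barking|glass breaking)",
)

print(regex_generator(prompt, audios))
# Expected: "glass breaking"
```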
89 changes: 89 additions & 0 deletions outlines/generate/api.py
@@ -621,3 +621,92 @@ def valid_types(prompts, media):
            )

        return prompts, media


class AudioSequenceGeneratorAdapter(SequenceGeneratorAdapter):
    def __call__(  # type: ignore
        self,
        prompts: Union[str, List[str]],
        media: Union[str, Any],
        max_tokens: Optional[int] = None,
        stop_at: Optional[Union[str, List[str]]] = None,
        seed: Optional[int] = None,
        **model_specific_params,
    ):
        """
        Generate text from a prompt or list of prompts.

        Media: a URI used to construct the media, or the media object itself. Passed as an AutoProcessor argument.
        """
        prompts, media = self._validate_prompt_media_types(prompts, media)

        generation_params = self.prepare_generation_parameters(
            max_tokens, stop_at, seed
        )

        completions = self.model.generate(
            prompts,
            media,
            generation_params,
            copy(self.logits_processor),
            self.sampling_params,
            **model_specific_params,
        )

        return self._format(completions)

    def stream(  # type: ignore
        self,
        prompts: Union[str, List[str]],
        media: List[Union[str, Any, List[Union[str, Any]]]],
        max_tokens: Optional[int] = None,
        stop_at: Optional[Union[str, List[str]]] = None,
        seed: Optional[int] = None,
        **model_specific_params,
    ):
        """Return a text generator from a prompt or a list of prompts."""
        prompts, media = self._validate_prompt_media_types(prompts, media)
        generation_params = self.prepare_generation_parameters(
            max_tokens, stop_at, seed
        )
        return self.model.stream(
            prompts,
            media,
            generation_params,
            copy(self.logits_processor),
            self.sampling_params,
            **model_specific_params,
        )

    @classmethod
    def _validate_prompt_media_types(
        cls,
        prompts: Union[str, List[str]],
        media: Union[str, Any, List[Union[str, Any]]],
    ) -> Union[Any, List[Any]]:
        """
        Prepare media as np.ndarray and ensure there is one np.ndarray waveform for every prompt str.
        """

        def valid_types(prompts, media):
            import numpy as np  # type: ignore

            if not isinstance(prompts, str):
                if not isinstance(prompts, list):
                    return False
                if not all(isinstance(p, str) for p in prompts):
                    return False
            if not isinstance(media, list):
                return False
            if not all(isinstance(m, np.ndarray) for m in media):
                return False
            return True

        if not valid_types(prompts, media):
            raise TypeError(
                "Expected (prompts, media) to be of type "
                "(str, List[np.ndarray]) or (List[str], List[np.ndarray]), "
                f"instead got prompts={prompts}, media={media}"
            )

        return prompts, media
11 changes: 10 additions & 1 deletion outlines/generate/cfg.py
@@ -1,10 +1,11 @@
from functools import singledispatch

from outlines.generate.api import (
    AudioSequenceGeneratorAdapter,
    SequenceGeneratorAdapter,
    VisionSequenceGeneratorAdapter,
)
from outlines.models import LlamaCpp, OpenAI, TransformersVision
from outlines.models import LlamaCpp, OpenAI, TransformersAudio, TransformersVision
from outlines.samplers import Sampler, multinomial


@@ -33,6 +34,14 @@ def cfg(
    return SequenceGeneratorAdapter(model, logits_processor, sampler)


@cfg.register(TransformersAudio)
def cfg_audio(model, cfg_str: str, sampler: Sampler = multinomial()):
    from outlines.processors import CFGLogitsProcessor

    logits_processor = CFGLogitsProcessor(cfg_str, tokenizer=model.tokenizer)
    return AudioSequenceGeneratorAdapter(model, logits_processor, sampler)


@cfg.register(TransformersVision)
def cfg_vision(model, cfg_str: str, sampler: Sampler = multinomial()):
    from outlines.processors import CFGLogitsProcessor
12 changes: 11 additions & 1 deletion outlines/generate/fsm.py
@@ -4,10 +4,11 @@

from outlines.fsm.guide import RegexGuide
from outlines.generate.api import (
    AudioSequenceGeneratorAdapter,
    SequenceGeneratorAdapter,
    VisionSequenceGeneratorAdapter,
)
from outlines.models import TransformersVision
from outlines.models import TransformersAudio, TransformersVision
from outlines.samplers import Sampler, multinomial


@@ -22,6 +23,15 @@ def fsm(
    return SequenceGeneratorAdapter(model, logits_processor, sampler)


@fsm.register(TransformersAudio)
def fsm_audio(model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial()):
    from outlines.processors import GuideLogitsProcessor

    guide = RegexGuide.from_interegular_fsm(fsm, model.tokenizer)
    logits_processor = GuideLogitsProcessor(tokenizer=model.tokenizer, guide=guide)
    return AudioSequenceGeneratorAdapter(model, logits_processor, sampler)


@fsm.register(TransformersVision)
def fsm_vision(model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial()):
    from outlines.processors import GuideLogitsProcessor
15 changes: 14 additions & 1 deletion outlines/generate/regex.py
@@ -1,10 +1,11 @@
from functools import singledispatch

from outlines.generate.api import (
    AudioSequenceGeneratorAdapter,
    SequenceGeneratorAdapter,
    VisionSequenceGeneratorAdapter,
)
from outlines.models import OpenAI, TransformersVision
from outlines.models import OpenAI, TransformersAudio, TransformersVision
from outlines.samplers import Sampler, multinomial


@@ -35,6 +36,18 @@ def regex(model, regex_str: str, sampler: Sampler = multinomial()):
    return SequenceGeneratorAdapter(model, logits_processor, sampler)


@regex.register(TransformersAudio)
def regex_audio(
    model,
    regex_str: str,
    sampler: Sampler = multinomial(),
):
    from outlines.processors import RegexLogitsProcessor

    logits_processor = RegexLogitsProcessor(regex_str, tokenizer=model.tokenizer)
    return AudioSequenceGeneratorAdapter(model, logits_processor, sampler)


@regex.register(TransformersVision)
def regex_vision(
    model,