Mllama flash version (#2585)
* Working loading state.

* Preprocessing.

* Working state? (Broke idefics1 temporarily).

* Cleaner condition.

* Fix idefics.

* Updating config, removing TODO

* Mllama

* Upgrade transformers 4.45

* Flashing mllama.

* Starting to get there.

* Working state.

* Integration tests for mllama (cutting to 10 tokens because there seems to be instability after, meaning the size of the batch matters).

* Updating model link.

* Earlier assert.

* Fix vlm?

* remove log.

* Force ignore all images but last.

* Default dtype bfloat16.

* Update integration test after switch to bf16.

* Remove dead code.

* Removed dead code.

* Upgrade the flake to latest transformers/tokenizers

* Move to hf tgi-nix

* Upgrade to 0.5.0
Narsil authored Oct 2, 2024
1 parent 584b4d7 commit d18ed5c
Showing 24 changed files with 3,242 additions and 1,564 deletions.
181 changes: 92 additions & 89 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion clients/python/text_generation/types.py
@@ -28,11 +28,17 @@ class ToolCall(BaseModel):
    function: dict


class Chunk(BaseModel):
    type: str
    text: Optional[str] = None
    image_url: Any = None


class Message(BaseModel):
    # Role of the message sender
    role: str
    # Content of the message
    content: Optional[str] = None
    content: Optional[Union[str, List[Chunk]]] = None
    # Optional name of the message sender
    name: Optional[str] = None
    # Tool calls associated with the chat completion
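With `content` now accepting either a plain string or a list of chunks, a multimodal message can be built directly from the new `Chunk` type. A minimal sketch using the updated client types (the text and image URL below are illustrative, not part of this diff):

from text_generation.types import Chunk, Message

# Plain-text messages keep working as before.
text_only = Message(role="user", content="Describe the image.")

# Multimodal message: one text chunk plus one image_url chunk.
multimodal = Message(
    role="user",
    content=[
        Chunk(type="text", text="Can you tell me a very short story based on the image?"),
        Chunk(
            type="image_url",
            image_url={
                "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
            },
        ),
    ],
)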
1 change: 1 addition & 0 deletions docs/source/supported_models.md
@@ -35,6 +35,7 @@ Text Generation Inference enables serving optimized models on specific hardware
- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
- [Gptj](https://huggingface.co/EleutherAI/gpt-j-6b)
- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
- [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) (Multimodal)


If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:
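Since Mllama is listed as multimodal, it is served through the same chat path as the other vision models. A minimal sketch of a request against a running TGI instance serving this model — the host, port, and use of the OpenAI-compatible /v1/chat/completions route are assumptions about the deployment, not part of this diff:

import requests

# Assumes a TGI server is already serving the model locally.
url = "http://localhost:3000/v1/chat/completions"

payload = {
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "max_tokens": 10,
    "temperature": 0.0,
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you tell me a very short story based on the image?",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                    },
                },
            ],
        }
    ],
}

response = requests.post(url, json=payload, timeout=60)
print(response.json()["choices"][0]["message"]["content"])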
33 changes: 16 additions & 17 deletions flake.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion flake.nix
@@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:danieldk/tgi-nix/moe-kernels-0.5.0";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {
@@ -0,0 +1,106 @@
[
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a bustling city, a chicken named Cluck",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727773835,
"id": "",
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
"object": "chat.completion",
"system_fingerprint": "2.3.1-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 50,
"total_tokens": 60
}
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a world where even chickens could dream big,",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727773835,
"id": "",
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
"object": "chat.completion",
"system_fingerprint": "2.3.1-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 50,
"total_tokens": 60
}
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a world where even chickens could dream big,",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727773835,
"id": "",
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
"object": "chat.completion",
"system_fingerprint": "2.3.1-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 50,
"total_tokens": 60
}
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a world where even chickens could dream big,",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727773835,
"id": "",
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
"object": "chat.completion",
"system_fingerprint": "2.3.1-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 50,
"total_tokens": 60
}
}
]
@@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a bustling city, a chicken named Cluck",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727556016,
"id": "",
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
"object": "chat.completion",
"system_fingerprint": "2.3.1-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 50,
"total_tokens": 60
}
}
105 changes: 105 additions & 0 deletions integration-tests/models/test_mllama.py
@@ -0,0 +1,105 @@
import pytest
import base64
import asyncio


@pytest.fixture(scope="module")
def mllama_handle(launcher):
    with launcher("meta-llama/Llama-3.2-11B-Vision-Instruct", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def mllama(mllama_handle):
    await mllama_handle.health(300)
    return mllama_handle.client


# TODO fix the server parser to count inline image tokens correctly
def get_chicken():
    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


def get_cow_beach():
    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


@pytest.mark.asyncio
async def test_mllama_simpl(mllama, response_snapshot):
    # chicken = get_chicken()
    response = await mllama.chat(
        max_tokens=10,
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Can you tell me a very short story based on the image?",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                        },
                    },
                ],
            },
        ],
    )

    assert response.usage == {
        "completion_tokens": 10,
        "prompt_tokens": 50,
        "total_tokens": 60,
    }
    assert (
        response.choices[0].message.content
        == "In a bustling city, a chicken named Cluck"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mllama_load(mllama, generate_load, response_snapshot):
    futures = [
        mllama.chat(
            max_tokens=10,
            temperature=0.0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Can you tell me a very short story based on the image?",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                            },
                        },
                    ],
                },
            ],
        )
        for i in range(4)
    ]
    responses = await asyncio.gather(*futures)

    generated_texts = [response.choices[0].message.content for response in responses]

    assert generated_texts[0] == "In a bustling city, a chicken named Cluck"
    assert len(generated_texts) == 4
    assert all(text == generated_texts[0] for text in generated_texts)

    assert responses == response_snapshot
1 change: 1 addition & 0 deletions router/src/config.rs
@@ -146,6 +146,7 @@ pub enum Config {
ClipVisionModel(ClipVisionModel),
Mistral,
Idefics,
Mllama,
Idefics2(Idefics2),
Ssm,
GptBigcode,
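The new `Mllama` variant corresponds to the `model_type` declared in the model's config.json, which is what this enum is matched against. A quick way to inspect that field — assuming access to the gated repo and a configured HF token, neither of which is part of this diff:

import json

from huggingface_hub import hf_hub_download

# Download only the config file; the repo is gated and requires an accepted license.
path = hf_hub_download(
    "meta-llama/Llama-3.2-11B-Vision-Instruct", filename="config.json"
)
with open(path) as f:
    config = json.load(f)

print(config["model_type"])  # expected: "mllama"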
2 changes: 1 addition & 1 deletion router/src/infer/chat_template.rs
@@ -29,7 +29,7 @@ impl ChatTemplate {
env.set_unknown_method_callback(pycompat::unknown_method_callback);
let template_str = template.into_boxed_str();
env.add_function("raise_exception", raise_exception);
tracing::debug!("Loading template: {:#?}", template_str);
tracing::debug!("Loading template: {}", template_str);

// leaking env and template_str as read-only, static resources for performance.
let template = Box::leak(env)
3 changes: 2 additions & 1 deletion router/src/validation.rs
@@ -567,6 +567,7 @@ fn image_tokens(
use HubPreprocessorConfig::*;
match config {
Idefics => "<image>".to_string(),
Mllama => "<|image|>".to_string(),
Idefics2(config) => {
const FAKE: &str = "<fake_token_around_image>";
const IMAGE: &str = "<image>";
@@ -618,7 +619,7 @@ fn prepare_input(
use Config::*;
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
let (tokenizer_query, input_chunks) = match config {
Some(config @ (Idefics | Idefics2(_) | Paligemma(_) | LlavaNext(_))) => {
Some(config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_))) => {
let mut input_chunks = Vec::new();
let mut tokenizer_query = String::with_capacity(inputs.len());
let mut start = 0;
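For Mllama, `prepare_input` now follows the same path as the other multimodal configs: inline image references in the prompt (matched by the regex above) are extracted as image chunks, and the placeholder written into the tokenizer query is the `<|image|>` token returned by `image_tokens`. A simplified Python analogue of that substitution — not the actual Rust implementation, and it skips fetching and validating the images:

import re

# Same pattern the router uses to find inline image references: ![](url)
IMAGE_RE = re.compile(r"!\[\]\([^)]*\)")

# Placeholder tokens per model family, mirroring image_tokens().
IMAGE_TOKENS = {
    "idefics": "<image>",
    "mllama": "<|image|>",
}


def prepare_input(inputs: str, model_type: str) -> tuple[str, list[str]]:
    """Split a prompt into a tokenizer query (images replaced by their
    placeholder token) and the list of extracted image URLs."""
    token = IMAGE_TOKENS[model_type]
    image_urls = [m[len("![](") : -1] for m in IMAGE_RE.findall(inputs)]
    tokenizer_query = IMAGE_RE.sub(token, inputs)
    return tokenizer_query, image_urls


query, urls = prepare_input(
    "![](https://example.com/cow_beach.png)Where is the cow standing?", "mllama"
)
# query == "<|image|>Where is the cow standing?"
# urls == ["https://example.com/cow_beach.png"]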