Commit
fix: fixed incorrect calculation of image tokens for GPT-4o mini models
adubovik committed Nov 5, 2024
1 parent 4d85ced commit bebe7c9
Showing 11 changed files with 131 additions and 77 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -62,7 +62,7 @@ Copy `.env.example` to `.env` and customize it for your environment.

### Categories of deployments

The following variables cluster all deployments into the groups of deployments which share the same API.
The following variables cluster all deployments into the groups of deployments which share the same API and the same tokenization algorithm.

|Variable|Default|Description|
|---|---|---|
@@ -73,6 +73,7 @@ The following variables cluster all deployments into the groups of deployments w
|MISTRAL_DEPLOYMENTS|``|Comma-separated list of deployments that support Mistral Large Azure API. Example: `mistral-large-azure,mistral-large`|
|DATABRICKS_DEPLOYMENTS|``|Comma-separated list of Databricks chat completion deployments. Example: `databricks-dbrx-instruct,databricks-mixtral-8x7b-instruct,databricks-llama-2-70b-chat`|
|GPT4O_DEPLOYMENTS|``|Comma-separated list of GPT-4o chat completion deployments. Example: `gpt-4o-2024-05-13`|
|GPT4O_MINI_DEPLOYMENTS|``|Comma-separated list of GPT-4o mini chat completion deployments. Example: `gpt-4o-mini-2024-07-18`|
|AZURE_AI_VISION_DEPLOYMENTS|``|Comma-separated list of Azure AI Vision embedding deployments. The endpoint of the deployment is expected to point to the Azure service: `https://<service-name>.cognitiveservices.azure.com/`|

Deployments that do not fall into any of the categories are considered to support text-to-text chat completion OpenAI API or text embeddings OpenAI API.
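
For example, a `.env` that enables image-token counting for both GPT-4o and GPT-4o mini deployments might contain the following (using the example deployment names from the table above):

```
GPT4O_DEPLOYMENTS=gpt-4o-2024-05-13
GPT4O_MINI_DEPLOYMENTS=gpt-4o-mini-2024-07-18
```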
57 changes: 31 additions & 26 deletions aidial_adapter_openai/app.py
@@ -37,7 +37,6 @@
DALLE3_DEPLOYMENTS,
DATABRICKS_DEPLOYMENTS,
GPT4_VISION_DEPLOYMENTS,
GPT4O_DEPLOYMENTS,
MISTRAL_DEPLOYMENTS,
MODEL_ALIASES,
NON_STREAMING_DEPLOYMENTS,
@@ -52,6 +51,7 @@
)
from aidial_adapter_openai.utils.auth import get_credentials
from aidial_adapter_openai.utils.http_client import get_http_client
from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer
from aidial_adapter_openai.utils.log_config import configure_loggers, logger
from aidial_adapter_openai.utils.parsers import completions_parser, parse_body
from aidial_adapter_openai.utils.streaming import create_server_response
@@ -148,34 +148,39 @@ async def call_chat_completion(
if deployment_id in DATABRICKS_DEPLOYMENTS:
return await databricks_chat_completion(data, upstream_endpoint, creds)

if deployment_id in GPT4_VISION_DEPLOYMENTS:
storage = create_file_storage("images", request.headers)
return await gpt4_vision_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
)
text_tokenizer_model = MODEL_ALIASES.get(deployment_id, deployment_id)

openai_model_name = MODEL_ALIASES.get(deployment_id, deployment_id)
if deployment_id in GPT4O_DEPLOYMENTS:
tokenizer = MultiModalTokenizer(openai_model_name)
if image_tokenizer := get_image_tokenizer(deployment_id):
storage = create_file_storage("images", request.headers)
return await gpt4o_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
tokenizer,
)

tokenizer = PlainTextTokenizer(model=openai_model_name)
if deployment_id in GPT4_VISION_DEPLOYMENTS:
tokenizer = MultiModalTokenizer("gpt-4", image_tokenizer)
return await gpt4_vision_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
tokenizer,
)
else:
tokenizer = MultiModalTokenizer(
text_tokenizer_model, image_tokenizer
)
return await gpt4o_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
tokenizer,
)

tokenizer = PlainTextTokenizer(model=text_tokenizer_model)
return await gpt_chat_completion(
data,
deployment_id,
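
As a quick illustration of the new dispatch (a sketch, assuming the deployment ids below appear in the corresponding `*_DEPLOYMENTS` env lists):

```python
from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer

# get_image_tokenizer returns the deployment-specific image tokenizer,
# or None for deployments without image inputs (plain-text path).
for deployment_id in [
    "gpt-4o-2024-05-13",       # assumed listed in GPT4O_DEPLOYMENTS
    "gpt-4o-mini-2024-07-18",  # assumed listed in GPT4O_MINI_DEPLOYMENTS
    "gpt-35-turbo",            # in no image list -> None
]:
    print(deployment_id, get_image_tokenizer(deployment_id))
```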
3 changes: 3 additions & 0 deletions aidial_adapter_openai/env.py
@@ -16,6 +16,9 @@
os.getenv("DATABRICKS_DEPLOYMENTS")
)
GPT4O_DEPLOYMENTS = parse_deployment_list(os.getenv("GPT4O_DEPLOYMENTS"))
GPT4O_MINI_DEPLOYMENTS = parse_deployment_list(
os.getenv("GPT4O_MINI_DEPLOYMENTS")
)
API_VERSIONS_MAPPING: Dict[str, str] = json.loads(
os.getenv("API_VERSIONS_MAPPING", "{}")
)
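
`parse_deployment_list` itself is outside this diff; presumably it just splits the comma-separated env value. A minimal sketch of the assumed behavior:

```python
def parse_deployment_list(value: str | None) -> list[str]:
    # Assumed behavior: split a comma-separated env value into trimmed,
    # non-empty deployment ids; an unset variable yields an empty list.
    if not value:
        return []
    return [item.strip() for item in value.split(",") if item.strip()]
```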
3 changes: 2 additions & 1 deletion aidial_adapter_openai/gpt4_multi_modal/chat_completion.py
@@ -163,6 +163,7 @@ async def gpt4_vision_chat_completion(
is_stream: bool,
file_storage: Optional[FileStorage],
api_version: str,
tokenizer: MultiModalTokenizer,
):
return await chat_completion(
request,
@@ -172,7 +173,7 @@
is_stream,
file_storage,
api_version,
MultiModalTokenizer("gpt-4"),
tokenizer,
convert_gpt4v_to_gpt4_chunk,
GPT4V_DEFAULT_MAX_TOKENS,
)
4 changes: 3 additions & 1 deletion aidial_adapter_openai/utils/image.py
@@ -1,5 +1,5 @@
from io import BytesIO
from typing import Literal
from typing import Literal, assert_never

from PIL import Image
from pydantic import BaseModel
@@ -21,6 +21,8 @@ def resolve_detail_level(
return "low"
case "high":
return "high"
case _:
assert_never(detail)


class ImageMetadata(BaseModel):
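
The `assert_never` additions make each `match` exhaustive for type checkers; a minimal standalone illustration of the pattern:

```python
from typing import Literal, assert_never

Detail = Literal["low", "high"]

def describe(detail: Detail) -> str:
    match detail:
        case "low":
            return "flat cost"
        case "high":
            return "tiled cost"
        case _:
            # Unreachable at runtime if the annotation holds; flags the
            # missed case at type-check time when a new literal is added.
            assert_never(detail)
```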
101 changes: 60 additions & 41 deletions aidial_adapter_openai/utils/image_tokenizer.py
@@ -1,45 +1,78 @@
"""
Tokenization of images as specified at
https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#image-tokens-gpt-4-turbo-with-vision
https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#image-tokens
"""

import base64
import math
from io import BytesIO
from typing import assert_never
from typing import List, Tuple, assert_never

from PIL import Image
from pydantic import BaseModel

from aidial_adapter_openai.env import (
GPT4_VISION_DEPLOYMENTS,
GPT4O_DEPLOYMENTS,
GPT4O_MINI_DEPLOYMENTS,
)
from aidial_adapter_openai.utils.image import ImageDetail, resolve_detail_level
from aidial_adapter_openai.utils.resource import Resource


def tokenize_image_data_url(image_data: str, detail: ImageDetail) -> int:
parsed_image_data = Resource.from_data_url(image_data)
if not parsed_image_data:
raise ValueError(f"Invalid image data URL {parsed_image_data!r}")
return tokenize_image(parsed_image_data, detail)
class ImageTokenizer(BaseModel):
low_detail_tokens: int
"""
Number of tokens per image in low resolution mode
"""

tokens_per_tile: int
"""
Number of tokens per one tile image of 512x512 size
"""

def tokenize_image(image: Resource, detail: ImageDetail) -> int:
image_data = base64.b64decode(image.data)
with Image.open(BytesIO(image_data)) as img:
width, height = img.size
return tokenize_image_by_size(width, height, detail)
def tokenize(self, width: int, height: int, detail: ImageDetail) -> int:
concrete_detail = resolve_detail_level(width, height, detail)
match concrete_detail:
case "low":
return self.low_detail_tokens
case "high":
return self._compute_high_detail_tokens(width, height)
case _:
assert_never(concrete_detail)

def _compute_high_detail_tokens(self, width: int, height: int) -> int:
# Fit into 2048x2048 box
width, height = _fit_longest(width, height, 2048)

def tokenize_image_by_size(width: int, height: int, detail: ImageDetail) -> int:
concrete_detail = resolve_detail_level(width, height, detail)
match concrete_detail:
case "low":
return 85
case "high":
return compute_high_detail_tokens(width, height)
case _:
assert_never(concrete_detail)
# Scale down so the shortest side is 768 pixels
width, height = _fit_shortest(width, height, 768)

# Calculate the number of 512-pixel tiles required
cols = math.ceil(width / 512)
rows = math.ceil(height / 512)

def fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
return (self.tokens_per_tile * cols * rows) + self.low_detail_tokens


GPT4O_IMAGE_TOKENIZER = GPT4_VISION_IMAGE_TOKENIZER = ImageTokenizer(
low_detail_tokens=85, tokens_per_tile=170
)
GPT4O_MINI_IMAGE_TOKENIZER = ImageTokenizer(
low_detail_tokens=2833, tokens_per_tile=5667
)

_TOKENIZERS: List[Tuple[ImageTokenizer, List[str]]] = [
(GPT4O_IMAGE_TOKENIZER, GPT4O_DEPLOYMENTS),
(GPT4O_MINI_IMAGE_TOKENIZER, GPT4O_MINI_DEPLOYMENTS),
(GPT4_VISION_IMAGE_TOKENIZER, GPT4_VISION_DEPLOYMENTS),
]


def get_image_tokenizer(deployment_id: str) -> ImageTokenizer | None:
for tokenizer, ids in _TOKENIZERS:
if deployment_id in ids:
return tokenizer
return None


def _fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
ratio = width / height
if width > height:
scaled_width = min(width, size)
@@ -51,7 +84,7 @@ def fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
return scaled_width, scaled_height


def fit_shortest(width: int, height: int, size: int) -> tuple[int, int]:
def _fit_shortest(width: int, height: int, size: int) -> tuple[int, int]:
ratio = width / height
if width < height:
scaled_width = min(width, size)
Expand All @@ -61,17 +94,3 @@ def fit_shortest(width: int, height: int, size: int) -> tuple[int, int]:
scaled_width = int(scaled_height * ratio)

return scaled_width, scaled_height


def compute_high_detail_tokens(width: int, height: int) -> int:
# Fit into 2048x2048 box
width, height = fit_longest(width, height, 2048)

# Scale down so the shortest side is 768 pixels
width, height = fit_shortest(width, height, 768)

# Calculate the number of 512-pixel tiles required
cols = math.ceil(width / 512)
rows = math.ceil(height / 512)

return (170 * cols * rows) + 85
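
A worked example of the constants above (a sketch; imports assume the module layout in this diff): a 1024×2048 image in high detail already fits the 2048 box, then is scaled so the shortest side is 768 (768×1536), which needs ceil(768/512) × ceil(1536/512) = 2 × 3 = 6 tiles. GPT-4o therefore charges 6·170 + 85 = 1105 tokens, while GPT-4o mini charges 6·5667 + 2833 = 36835.

```python
from aidial_adapter_openai.utils.image_tokenizer import (
    GPT4O_IMAGE_TOKENIZER,
    GPT4O_MINI_IMAGE_TOKENIZER,
)

# 1024x2048 high-detail image -> 768x1536 after rescaling -> 2x3 = 6 tiles
assert GPT4O_IMAGE_TOKENIZER.tokenize(1024, 2048, "high") == 6 * 170 + 85           # 1105
assert GPT4O_MINI_IMAGE_TOKENIZER.tokenize(1024, 2048, "high") == 6 * 5667 + 2833   # 36835
```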
10 changes: 8 additions & 2 deletions aidial_adapter_openai/utils/tokenizer.py
@@ -8,7 +8,7 @@
from aidial_sdk.exceptions import InternalServerError
from tiktoken import Encoding, encoding_for_model

from aidial_adapter_openai.utils.image_tokenizer import tokenize_image_by_size
from aidial_adapter_openai.utils.image_tokenizer import ImageTokenizer
from aidial_adapter_openai.utils.multi_modal_message import MultiModalMessage

MessageType = TypeVar("MessageType")
@@ -131,6 +131,12 @@ def calculate_message_tokens(self, message: dict) -> int:


class MultiModalTokenizer(BaseTokenizer[MultiModalMessage]):
image_tokenizer: ImageTokenizer

def __init__(self, model: str, image_tokenizer: ImageTokenizer):
super().__init__(model)
self.image_tokenizer = image_tokenizer

def calculate_message_tokens(self, message: MultiModalMessage) -> int:
tokens = self.tokens_per_message
raw_message = message.raw_message
@@ -144,7 +150,7 @@ def calculate_message_tokens(self, message: MultiModalMessage) -> int:

# Processing image parts of message
for metadata in message.image_metadatas:
tokens += tokenize_image_by_size(
tokens += self.image_tokenizer.tokenize(
width=metadata.width,
height=metadata.height,
detail=metadata.detail,
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -47,6 +47,8 @@ flake8 = "6.0.0"

[tool.poetry.group.dev.dependencies]
nox = "^2023.4.22"
# Required for `make serve` which loads .env file
python-dotenv = "^1.0.1"

[tool.pytest.ini_options]
# muting warnings coming from opentelemetry and pkg_resources packages
6 changes: 3 additions & 3 deletions tests/test_image_tokenization.py
@@ -3,7 +3,7 @@
import pytest

from aidial_adapter_openai.utils.image import ImageDetail
from aidial_adapter_openai.utils.image_tokenizer import tokenize_image_by_size
from aidial_adapter_openai.utils.image_tokenizer import GPT4O_IMAGE_TOKENIZER

test_cases: List[Tuple[int, int, ImageDetail, int]] = [
(1, 1, "auto", 85),
@@ -25,8 +25,8 @@

@pytest.mark.parametrize("width, height, detail, expected_tokens", test_cases)
def test_tokenize(width, height, detail, expected_tokens):
tokens1 = tokenize_image_by_size(width, height, detail)
tokens2 = tokenize_image_by_size(height, width, detail)
tokens1 = GPT4O_IMAGE_TOKENIZER.tokenize(width, height, detail)
tokens2 = GPT4O_IMAGE_TOKENIZER.tokenize(height, width, detail)

assert tokens1 == expected_tokens
assert tokens2 == expected_tokens
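
The parametrized cases above exercise the GPT-4o tokenizer; a parallel spot-check for the mini variant (hypothetical, not part of this diff) could assert its low-detail flat cost:

```python
from aidial_adapter_openai.utils.image_tokenizer import GPT4O_MINI_IMAGE_TOKENIZER

def test_tokenize_mini_low_detail():
    # Hypothetical extra test: low-detail images cost a flat 2833 tokens.
    assert GPT4O_MINI_IMAGE_TOKENIZER.tokenize(1, 1, "low") == 2833
```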
3 changes: 2 additions & 1 deletion tests/test_multimodal_truncate.py
@@ -8,11 +8,12 @@
multi_modal_truncate_prompt,
)
from aidial_adapter_openai.utils.image import ImageMetadata
from aidial_adapter_openai.utils.image_tokenizer import GPT4O_IMAGE_TOKENIZER
from aidial_adapter_openai.utils.multi_modal_message import MultiModalMessage
from aidial_adapter_openai.utils.resource import Resource
from aidial_adapter_openai.utils.tokenizer import MultiModalTokenizer

tokenizer = MultiModalTokenizer("gpt-4o")
tokenizer = MultiModalTokenizer("gpt-4o", GPT4O_IMAGE_TOKENIZER)


def test_multimodal_truncate_with_system_and_last_user_error():
