Commit
fix: fixed incorrect calculation of image tokens for GPT-4o mini models
adubovik committed Nov 5, 2024
1 parent 4d85ced commit bebe7c9
Showing 11 changed files with 131 additions and 77 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -62,7 +62,7 @@ Copy `.env.example` to `.env` and customize it for your environment.

### Categories of deployments

The following variables cluster all deployments into the groups of deployments which share the same API.
The following variables cluster all deployments into the groups of deployments which share the same API and the same tokenization algorithm.

|Variable|Default|Description|
|---|---|---|
@@ -73,6 +73,7 @@ The following variables cluster all deployments into the groups of deployments w
|MISTRAL_DEPLOYMENTS|``|Comma-separated list of deployments that support Mistral Large Azure API. Example: `mistral-large-azure,mistral-large`|
|DATABRICKS_DEPLOYMENTS|``|Comma-separated list of Databricks chat completion deployments. Example: `databricks-dbrx-instruct,databricks-mixtral-8x7b-instruct,databricks-llama-2-70b-chat`|
|GPT4O_DEPLOYMENTS|``|Comma-separated list of GPT-4o chat completion deployments. Example: `gpt-4o-2024-05-13`|
|GPT4O_MINI_DEPLOYMENTS|``|Comma-separated list of GPT-4o mini chat completion deployments. Example: `gpt-4o-mini-2024-07-18`|
|AZURE_AI_VISION_DEPLOYMENTS|``|Comma-separated list of Azure AI Vision embedding deployments. The endpoint of the deployment is expected to point to the Azure service: `https://<service-name>.cognitiveservices.azure.com/`|

Deployments that do not fall into any of the categories are considered to support text-to-text chat completion OpenAI API or text embeddings OpenAI API.
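
For example, a `.env` that enables image-token counting for both GPT-4o and GPT-4o mini deployments might contain the following (using the example deployment names from the table above):

```
GPT4O_DEPLOYMENTS=gpt-4o-2024-05-13
GPT4O_MINI_DEPLOYMENTS=gpt-4o-mini-2024-07-18
```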
57 changes: 31 additions & 26 deletions aidial_adapter_openai/app.py
@@ -37,7 +37,6 @@
DALLE3_DEPLOYMENTS,
DATABRICKS_DEPLOYMENTS,
GPT4_VISION_DEPLOYMENTS,
GPT4O_DEPLOYMENTS,
MISTRAL_DEPLOYMENTS,
MODEL_ALIASES,
NON_STREAMING_DEPLOYMENTS,
@@ -52,6 +51,7 @@
)
from aidial_adapter_openai.utils.auth import get_credentials
from aidial_adapter_openai.utils.http_client import get_http_client
from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer
from aidial_adapter_openai.utils.log_config import configure_loggers, logger
from aidial_adapter_openai.utils.parsers import completions_parser, parse_body
from aidial_adapter_openai.utils.streaming import create_server_response
@@ -148,34 +148,39 @@ async def call_chat_completion(
if deployment_id in DATABRICKS_DEPLOYMENTS:
return await databricks_chat_completion(data, upstream_endpoint, creds)

if deployment_id in GPT4_VISION_DEPLOYMENTS:
storage = create_file_storage("images", request.headers)
return await gpt4_vision_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
)
text_tokenizer_model = MODEL_ALIASES.get(deployment_id, deployment_id)

openai_model_name = MODEL_ALIASES.get(deployment_id, deployment_id)
if deployment_id in GPT4O_DEPLOYMENTS:
tokenizer = MultiModalTokenizer(openai_model_name)
if image_tokenizer := get_image_tokenizer(deployment_id):
storage = create_file_storage("images", request.headers)
return await gpt4o_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
tokenizer,
)

tokenizer = PlainTextTokenizer(model=openai_model_name)
if deployment_id in GPT4_VISION_DEPLOYMENTS:
tokenizer = MultiModalTokenizer("gpt-4", image_tokenizer)
return await gpt4_vision_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
tokenizer,
)
else:
tokenizer = MultiModalTokenizer(
text_tokenizer_model, image_tokenizer
)
return await gpt4o_chat_completion(
data,
deployment_id,
upstream_endpoint,
creds,
is_stream,
storage,
api_version,
tokenizer,
)

tokenizer = PlainTextTokenizer(model=text_tokenizer_model)
return await gpt_chat_completion(
data,
deployment_id,
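
As a quick illustration of the new dispatch (a sketch, assuming the deployment ids below appear in the corresponding `*_DEPLOYMENTS` env lists):

```python
from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer

# get_image_tokenizer returns the deployment-specific image tokenizer,
# or None for deployments without image inputs (plain-text path).
for deployment_id in [
    "gpt-4o-2024-05-13",       # assumed listed in GPT4O_DEPLOYMENTS
    "gpt-4o-mini-2024-07-18",  # assumed listed in GPT4O_MINI_DEPLOYMENTS
    "gpt-35-turbo",            # in no image list -> None
]:
    print(deployment_id, get_image_tokenizer(deployment_id))
```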
3 changes: 3 additions & 0 deletions aidial_adapter_openai/env.py
@@ -16,6 +16,9 @@
os.getenv("DATABRICKS_DEPLOYMENTS")
)
GPT4O_DEPLOYMENTS = parse_deployment_list(os.getenv("GPT4O_DEPLOYMENTS"))
GPT4O_MINI_DEPLOYMENTS = parse_deployment_list(
os.getenv("GPT4O_MINI_DEPLOYMENTS")
)
API_VERSIONS_MAPPING: Dict[str, str] = json.loads(
os.getenv("API_VERSIONS_MAPPING", "{}")
)
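
`parse_deployment_list` itself is outside this diff; presumably it just splits the comma-separated env value. A minimal sketch of the assumed behavior:

```python
def parse_deployment_list(value: str | None) -> list[str]:
    # Assumed behavior: split a comma-separated env value into trimmed,
    # non-empty deployment ids; an unset variable yields an empty list.
    if not value:
        return []
    return [item.strip() for item in value.split(",") if item.strip()]
```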
3 changes: 2 additions & 1 deletion aidial_adapter_openai/gpt4_multi_modal/chat_completion.py
@@ -163,6 +163,7 @@ async def gpt4_vision_chat_completion(
is_stream: bool,
file_storage: Optional[FileStorage],
api_version: str,
tokenizer: MultiModalTokenizer,
):
return await chat_completion(
request,
@@ -172,7 +173,7 @@
is_stream,
file_storage,
api_version,
MultiModalTokenizer("gpt-4"),
tokenizer,
convert_gpt4v_to_gpt4_chunk,
GPT4V_DEFAULT_MAX_TOKENS,
)
4 changes: 3 additions & 1 deletion aidial_adapter_openai/utils/image.py
@@ -1,5 +1,5 @@
from io import BytesIO
from typing import Literal
from typing import Literal, assert_never

from PIL import Image
from pydantic import BaseModel
@@ -21,6 +21,8 @@ def resolve_detail_level(
return "low"
case "high":
return "high"
case _:
assert_never(detail)


class ImageMetadata(BaseModel):
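
The `assert_never` additions make each `match` exhaustive for type checkers; a minimal standalone illustration of the pattern:

```python
from typing import Literal, assert_never

Detail = Literal["low", "high"]

def describe(detail: Detail) -> str:
    match detail:
        case "low":
            return "flat cost"
        case "high":
            return "tiled cost"
        case _:
            # Unreachable at runtime if the annotation holds; flags the
            # missed case at type-check time when a new literal is added.
            assert_never(detail)
```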
101 changes: 60 additions & 41 deletions aidial_adapter_openai/utils/image_tokenizer.py
@@ -1,45 +1,78 @@
"""
Tokenization of images as specified at
https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#image-tokens-gpt-4-turbo-with-vision
https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#image-tokens
"""

import base64
import math
from io import BytesIO
from typing import assert_never
from typing import List, Tuple, assert_never

from PIL import Image
from pydantic import BaseModel

from aidial_adapter_openai.env import (
GPT4_VISION_DEPLOYMENTS,
GPT4O_DEPLOYMENTS,
GPT4O_MINI_DEPLOYMENTS,
)
from aidial_adapter_openai.utils.image import ImageDetail, resolve_detail_level
from aidial_adapter_openai.utils.resource import Resource


def tokenize_image_data_url(image_data: str, detail: ImageDetail) -> int:
parsed_image_data = Resource.from_data_url(image_data)
if not parsed_image_data:
raise ValueError(f"Invalid image data URL {parsed_image_data!r}")
return tokenize_image(parsed_image_data, detail)
class ImageTokenizer(BaseModel):
low_detail_tokens: int
"""
Number of tokens per image in low resolution mode
"""

tokens_per_tile: int
"""
Number of tokens per one tile image of 512x512 size
"""

def tokenize_image(image: Resource, detail: ImageDetail) -> int:
image_data = base64.b64decode(image.data)
with Image.open(BytesIO(image_data)) as img:
width, height = img.size
return tokenize_image_by_size(width, height, detail)
def tokenize(self, width: int, height: int, detail: ImageDetail) -> int:
concrete_detail = resolve_detail_level(width, height, detail)
match concrete_detail:
case "low":
return self.low_detail_tokens
case "high":
return self._compute_high_detail_tokens(width, height)
case _:
assert_never(concrete_detail)

def _compute_high_detail_tokens(self, width: int, height: int) -> int:
# Fit into 2048x2048 box
width, height = _fit_longest(width, height, 2048)

def tokenize_image_by_size(width: int, height: int, detail: ImageDetail) -> int:
concrete_detail = resolve_detail_level(width, height, detail)
match concrete_detail:
case "low":
return 85
case "high":
return compute_high_detail_tokens(width, height)
case _:
assert_never(concrete_detail)
# Scale down so the shortest side is 768 pixels
width, height = _fit_shortest(width, height, 768)

# Calculate the number of 512-pixel tiles required
cols = math.ceil(width / 512)
rows = math.ceil(height / 512)

def fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
return (self.tokens_per_tile * cols * rows) + self.low_detail_tokens


GPT4O_IMAGE_TOKENIZER = GPT4_VISION_IMAGE_TOKENIZER = ImageTokenizer(
low_detail_tokens=85, tokens_per_tile=170
)
GPT4O_MINI_IMAGE_TOKENIZER = ImageTokenizer(
low_detail_tokens=2833, tokens_per_tile=5667
)

_TOKENIZERS: List[Tuple[ImageTokenizer, List[str]]] = [
(GPT4O_IMAGE_TOKENIZER, GPT4O_DEPLOYMENTS),
(GPT4O_MINI_IMAGE_TOKENIZER, GPT4O_MINI_DEPLOYMENTS),
(GPT4_VISION_IMAGE_TOKENIZER, GPT4_VISION_DEPLOYMENTS),
]


def get_image_tokenizer(deployment_id: str) -> ImageTokenizer | None:
for tokenizer, ids in _TOKENIZERS:
if deployment_id in ids:
return tokenizer
return None


def _fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
ratio = width / height
if width > height:
scaled_width = min(width, size)
@@ -51,7 +84,7 @@ def fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
return scaled_width, scaled_height


def fit_shortest(width: int, height: int, size: int) -> tuple[int, int]:
def _fit_shortest(width: int, height: int, size: int) -> tuple[int, int]:
ratio = width / height
if width < height:
scaled_width = min(width, size)
Expand All @@ -61,17 +94,3 @@ def fit_shortest(width: int, height: int, size: int) -> tuple[int, int]:
scaled_width = int(scaled_height * ratio)

return scaled_width, scaled_height


def compute_high_detail_tokens(width: int, height: int) -> int:
# Fit into 2048x2048 box
width, height = fit_longest(width, height, 2048)

# Scale down so the shortest side is 768 pixels
width, height = fit_shortest(width, height, 768)

# Calculate the number of 512-pixel tiles required
cols = math.ceil(width / 512)
rows = math.ceil(height / 512)

return (170 * cols * rows) + 85
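
A worked example of the constants above (a sketch; imports assume the module layout in this diff): a 1024×2048 image in high detail already fits the 2048 box, then is scaled so the shortest side is 768 (768×1536), which needs ceil(768/512) × ceil(1536/512) = 2 × 3 = 6 tiles. GPT-4o therefore charges 6·170 + 85 = 1105 tokens, while GPT-4o mini charges 6·5667 + 2833 = 36835.

```python
from aidial_adapter_openai.utils.image_tokenizer import (
    GPT4O_IMAGE_TOKENIZER,
    GPT4O_MINI_IMAGE_TOKENIZER,
)

# 1024x2048 high-detail image -> 768x1536 after rescaling -> 2x3 = 6 tiles
assert GPT4O_IMAGE_TOKENIZER.tokenize(1024, 2048, "high") == 6 * 170 + 85           # 1105
assert GPT4O_MINI_IMAGE_TOKENIZER.tokenize(1024, 2048, "high") == 6 * 5667 + 2833   # 36835
```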
10 changes: 8 additions & 2 deletions aidial_adapter_openai/utils/tokenizer.py
@@ -8,7 +8,7 @@
from aidial_sdk.exceptions import InternalServerError
from tiktoken import Encoding, encoding_for_model

from aidial_adapter_openai.utils.image_tokenizer import tokenize_image_by_size
from aidial_adapter_openai.utils.image_tokenizer import ImageTokenizer
from aidial_adapter_openai.utils.multi_modal_message import MultiModalMessage

MessageType = TypeVar("MessageType")
@@ -131,6 +131,12 @@ def calculate_message_tokens(self, message: dict) -> int:


class MultiModalTokenizer(BaseTokenizer[MultiModalMessage]):
image_tokenizer: ImageTokenizer

def __init__(self, model: str, image_tokenizer: ImageTokenizer):
super().__init__(model)
self.image_tokenizer = image_tokenizer

def calculate_message_tokens(self, message: MultiModalMessage) -> int:
tokens = self.tokens_per_message
raw_message = message.raw_message
@@ -144,7 +150,7 @@ def calculate_message_tokens(self, message: MultiModalMessage) -> int:

# Processing image parts of message
for metadata in message.image_metadatas:
tokens += tokenize_image_by_size(
tokens += self.image_tokenizer.tokenize(
width=metadata.width,
height=metadata.height,
detail=metadata.detail,
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -47,6 +47,8 @@ flake8 = "6.0.0"

[tool.poetry.group.dev.dependencies]
nox = "^2023.4.22"
# Required for `make serve` which loads .env file
python-dotenv = "^1.0.1"

[tool.pytest.ini_options]
# muting warnings coming from opentelemetry and pkg_resources packages
6 changes: 3 additions & 3 deletions tests/test_image_tokenization.py
@@ -3,7 +3,7 @@
import pytest

from aidial_adapter_openai.utils.image import ImageDetail
from aidial_adapter_openai.utils.image_tokenizer import tokenize_image_by_size
from aidial_adapter_openai.utils.image_tokenizer import GPT4O_IMAGE_TOKENIZER

test_cases: List[Tuple[int, int, ImageDetail, int]] = [
(1, 1, "auto", 85),
@@ -25,8 +25,8 @@

@pytest.mark.parametrize("width, height, detail, expected_tokens", test_cases)
def test_tokenize(width, height, detail, expected_tokens):
tokens1 = tokenize_image_by_size(width, height, detail)
tokens2 = tokenize_image_by_size(height, width, detail)
tokens1 = GPT4O_IMAGE_TOKENIZER.tokenize(width, height, detail)
tokens2 = GPT4O_IMAGE_TOKENIZER.tokenize(height, width, detail)

assert tokens1 == expected_tokens
assert tokens2 == expected_tokens
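
The parametrized cases above exercise the GPT-4o tokenizer; a parallel spot-check for the mini variant (hypothetical, not part of this diff) could assert its low-detail flat cost:

```python
from aidial_adapter_openai.utils.image_tokenizer import GPT4O_MINI_IMAGE_TOKENIZER

def test_tokenize_mini_low_detail():
    # Hypothetical extra test: low-detail images cost a flat 2833 tokens.
    assert GPT4O_MINI_IMAGE_TOKENIZER.tokenize(1, 1, "low") == 2833
```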
3 changes: 2 additions & 1 deletion tests/test_multimodal_truncate.py
@@ -8,11 +8,12 @@
multi_modal_truncate_prompt,
)
from aidial_adapter_openai.utils.image import ImageMetadata
from aidial_adapter_openai.utils.image_tokenizer import GPT4O_IMAGE_TOKENIZER
from aidial_adapter_openai.utils.multi_modal_message import MultiModalMessage
from aidial_adapter_openai.utils.resource import Resource
from aidial_adapter_openai.utils.tokenizer import MultiModalTokenizer

tokenizer = MultiModalTokenizer("gpt-4o")
tokenizer = MultiModalTokenizer("gpt-4o", GPT4O_IMAGE_TOKENIZER)


def test_multimodal_truncate_with_system_and_last_user_error():
