From 83b9dd281803469efb0fd8e2686eb09738b5586a Mon Sep 17 00:00:00 2001
From: Anton Dubovik
Date: Tue, 5 Nov 2024 17:00:38 +0000
Subject: [PATCH] fix: make image tokenizer respect 'detail' field (#167)

---
 .../gpt4_multi_modal/transformation.py | 19 +++++++++++++------
 aidial_adapter_openai/utils/image.py   |  8 +++++---
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/aidial_adapter_openai/gpt4_multi_modal/transformation.py b/aidial_adapter_openai/gpt4_multi_modal/transformation.py
index 804653c..d876edf 100644
--- a/aidial_adapter_openai/gpt4_multi_modal/transformation.py
+++ b/aidial_adapter_openai/gpt4_multi_modal/transformation.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Set, cast
+from typing import List, Optional, Set, cast
 
 from aidial_sdk.exceptions import HTTPException as DialException
 from aidial_sdk.exceptions import InvalidRequestError
@@ -13,7 +13,7 @@
     parse_attachment,
 )
 from aidial_adapter_openai.dial_api.storage import FileStorage
-from aidial_adapter_openai.utils.image import ImageMetadata
+from aidial_adapter_openai.utils.image import ImageDetail, ImageMetadata
 from aidial_adapter_openai.utils.log_config import logger
 from aidial_adapter_openai.utils.multi_modal_message import (
     MultiModalMessage,
@@ -42,12 +42,15 @@ class Config:
     errors: Set[TransformationError] = Field(default_factory=set)
 
     def collect_resource(
-        self, meta: List[ImageMetadata], result: Resource | TransformationError
+        self,
+        meta: List[ImageMetadata],
+        result: Resource | TransformationError,
+        detail: Optional[ImageDetail],
     ):
         if isinstance(result, TransformationError):
             self.errors.add(result)
         else:
-            meta.append(ImageMetadata.from_resource(result))
+            meta.append(ImageMetadata.from_resource(result, detail))
 
     async def try_download_resource(
         self, dial_resource: DialResource
@@ -84,7 +87,7 @@ async def download_attachment_images(
                 supported_types=SUPPORTED_IMAGE_TYPES,
             )
             result = await self.try_download_resource(dial_resource)
-            self.collect_resource(ret, result)
+            self.collect_resource(ret, result, None)
 
         return ret
 
@@ -98,13 +101,17 @@ async def download_content_images(
 
         for content_part in content:
             if image_url := content_part.get("image_url", {}).get("url"):
+                image_detail = content_part.get("detail")
+                if image_detail not in [None, "auto", "low", "high"]:
+                    raise ValidationError("Unexpected image detail")
+
                 dial_resource = URLResource(
                     url=image_url,
                     entity_name="image",
                     supported_types=SUPPORTED_IMAGE_TYPES,
                 )
                 result = await self.try_download_resource(dial_resource)
-                self.collect_resource(ret, result)
+                self.collect_resource(ret, result, image_detail)
 
         return ret
 
diff --git a/aidial_adapter_openai/utils/image.py b/aidial_adapter_openai/utils/image.py
index 991b991..621d26c 100644
--- a/aidial_adapter_openai/utils/image.py
+++ b/aidial_adapter_openai/utils/image.py
@@ -1,5 +1,5 @@
 from io import BytesIO
-from typing import Literal, assert_never
+from typing import Literal, Optional, assert_never
 
 from PIL import Image
 from pydantic import BaseModel
@@ -36,7 +36,9 @@ class ImageMetadata(BaseModel):
     detail: DetailLevel
 
     @classmethod
-    def from_resource(cls, image: Resource) -> "ImageMetadata":
+    def from_resource(
+        cls, image: Resource, detail: Optional[ImageDetail]
+    ) -> "ImageMetadata":
         with Image.open(BytesIO(image.data)) as img:
             width, height = img.size
 
@@ -44,5 +46,5 @@ def from_resource(cls, image: Resource) -> "ImageMetadata":
             image=image,
             width=width,
             height=height,
-            detail=resolve_detail_level(width, height, "auto"),
+            detail=resolve_detail_level(width, height, detail or "auto"),
         )
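
Note (not part of the patch): a minimal usage sketch of how the new "detail"
argument is meant to reach ImageMetadata.from_resource. The Resource import
path and constructor fields below are assumptions made for illustration only;
from_resource itself reads nothing but resource.data.

# Hypothetical usage sketch; Resource import path and fields are assumed.
from io import BytesIO

from PIL import Image

from aidial_adapter_openai.utils.image import ImageMetadata
from aidial_adapter_openai.utils.resource import Resource  # assumed import path

# Build a tiny in-memory PNG standing in for a downloaded image.
buf = BytesIO()
Image.new("RGB", (64, 64)).save(buf, format="PNG")
resource = Resource(type="image/png", data=buf.getvalue())  # assumed fields

# A content part may carry an optional "detail" field; the patch validates it
# against None, "auto", "low" and "high" before passing it along.
image_detail = {"image_url": {"url": "..."}, "detail": "low"}.get("detail")

# The caller-provided detail now overrides the previously hard-coded "auto";
# passing None still falls back to "auto" via `detail or "auto"`.
metadata = ImageMetadata.from_resource(resource, image_detail)
print(metadata.detail)  # resolved detail level used by the image tokenizer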