fix: make image tokenizer respect 'detail' field (#167)

epam · Nov 5, 2024 · 83b9dd2
1 parent ca28d1d
commit 83b9dd2
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 9 deletions.
diff --git a/aidial_adapter_openai/gpt4_multi_modal/transformation.py b/aidial_adapter_openai/gpt4_multi_modal/transformation.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Set, cast
+from typing import List, Optional, Set, cast
 
 from aidial_sdk.exceptions import HTTPException as DialException
 from aidial_sdk.exceptions import InvalidRequestError
@@ -13,7 +13,7 @@
     parse_attachment,
 )
 from aidial_adapter_openai.dial_api.storage import FileStorage
-from aidial_adapter_openai.utils.image import ImageMetadata
+from aidial_adapter_openai.utils.image import ImageDetail, ImageMetadata
 from aidial_adapter_openai.utils.log_config import logger
 from aidial_adapter_openai.utils.multi_modal_message import (
     MultiModalMessage,
@@ -42,12 +42,15 @@ class Config:
     errors: Set[TransformationError] = Field(default_factory=set)
 
     def collect_resource(
-        self, meta: List[ImageMetadata], result: Resource | TransformationError
+        self,
+        meta: List[ImageMetadata],
+        result: Resource | TransformationError,
+        detail: Optional[ImageDetail],
     ):
         if isinstance(result, TransformationError):
             self.errors.add(result)
         else:
-            meta.append(ImageMetadata.from_resource(result))
+            meta.append(ImageMetadata.from_resource(result, detail))
 
     async def try_download_resource(
         self, dial_resource: DialResource
@@ -84,7 +87,7 @@ async def download_attachment_images(
                 supported_types=SUPPORTED_IMAGE_TYPES,
             )
             result = await self.try_download_resource(dial_resource)
-            self.collect_resource(ret, result)
+            self.collect_resource(ret, result, None)
 
         return ret
 
@@ -98,13 +101,17 @@ async def download_content_images(
 
         for content_part in content:
             if image_url := content_part.get("image_url", {}).get("url"):
+                image_detail = content_part.get("detail")
+                if image_detail not in [None, "auto", "low", "high"]:
+                    raise ValidationError("Unexpected image detail")
+
                 dial_resource = URLResource(
                     url=image_url,
                     entity_name="image",
                     supported_types=SUPPORTED_IMAGE_TYPES,
                 )
                 result = await self.try_download_resource(dial_resource)
-                self.collect_resource(ret, result)
+                self.collect_resource(ret, result, image_detail)
 
         return ret
 

diff --git a/aidial_adapter_openai/utils/image.py b/aidial_adapter_openai/utils/image.py
@@ -1,5 +1,5 @@
 from io import BytesIO
-from typing import Literal, assert_never
+from typing import Literal, Optional, assert_never
 
 from PIL import Image
 from pydantic import BaseModel
@@ -36,13 +36,15 @@ class ImageMetadata(BaseModel):
     detail: DetailLevel
 
     @classmethod
-    def from_resource(cls, image: Resource) -> "ImageMetadata":
+    def from_resource(
+        cls, image: Resource, detail: Optional[ImageDetail]
+    ) -> "ImageMetadata":
         with Image.open(BytesIO(image.data)) as img:
             width, height = img.size
 
         return cls(
             image=image,
             width=width,
             height=height,
-            detail=resolve_detail_level(width, height, "auto"),
+            detail=resolve_detail_level(width, height, detail or "auto"),
         )