From 83b9dd281803469efb0fd8e2686eb09738b5586a Mon Sep 17 00:00:00 2001
From: Anton Dubovik
Date: Tue, 5 Nov 2024 17:00:38 +0000
Subject: [PATCH] fix: make image tokenizer respect 'detail' field (#167)

---
 .../gpt4_multi_modal/transformation.py | 19 +++++++++++++------
 aidial_adapter_openai/utils/image.py   |  8 +++++---
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/aidial_adapter_openai/gpt4_multi_modal/transformation.py b/aidial_adapter_openai/gpt4_multi_modal/transformation.py
index 804653c..d876edf 100644
--- a/aidial_adapter_openai/gpt4_multi_modal/transformation.py
+++ b/aidial_adapter_openai/gpt4_multi_modal/transformation.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Set, cast
+from typing import List, Optional, Set, cast
 
 from aidial_sdk.exceptions import HTTPException as DialException
 from aidial_sdk.exceptions import InvalidRequestError
@@ -13,7 +13,7 @@
     parse_attachment,
 )
 from aidial_adapter_openai.dial_api.storage import FileStorage
-from aidial_adapter_openai.utils.image import ImageMetadata
+from aidial_adapter_openai.utils.image import ImageDetail, ImageMetadata
 from aidial_adapter_openai.utils.log_config import logger
 from aidial_adapter_openai.utils.multi_modal_message import (
     MultiModalMessage,
@@ -42,12 +42,15 @@ class Config:
     errors: Set[TransformationError] = Field(default_factory=set)
 
     def collect_resource(
-        self, meta: List[ImageMetadata], result: Resource | TransformationError
+        self,
+        meta: List[ImageMetadata],
+        result: Resource | TransformationError,
+        detail: Optional[ImageDetail],
     ):
         if isinstance(result, TransformationError):
             self.errors.add(result)
         else:
-            meta.append(ImageMetadata.from_resource(result))
+            meta.append(ImageMetadata.from_resource(result, detail))
 
     async def try_download_resource(
         self, dial_resource: DialResource
@@ -84,7 +87,7 @@ async def download_attachment_images(
                 supported_types=SUPPORTED_IMAGE_TYPES,
             )
             result = await self.try_download_resource(dial_resource)
-            self.collect_resource(ret, result)
+            self.collect_resource(ret, result, None)
 
         return ret
 
@@ -98,13 +101,17 @@ async def download_content_images(
 
         for content_part in content:
             if image_url := content_part.get("image_url", {}).get("url"):
+                image_detail = content_part.get("detail")
+                if image_detail not in [None, "auto", "low", "high"]:
+                    raise ValidationError("Unexpected image detail")
+
                 dial_resource = URLResource(
                     url=image_url,
                     entity_name="image",
                     supported_types=SUPPORTED_IMAGE_TYPES,
                 )
                 result = await self.try_download_resource(dial_resource)
-                self.collect_resource(ret, result)
+                self.collect_resource(ret, result, image_detail)
 
         return ret
 
diff --git a/aidial_adapter_openai/utils/image.py b/aidial_adapter_openai/utils/image.py
index 991b991..621d26c 100644
--- a/aidial_adapter_openai/utils/image.py
+++ b/aidial_adapter_openai/utils/image.py
@@ -1,5 +1,5 @@
 from io import BytesIO
-from typing import Literal, assert_never
+from typing import Literal, Optional, assert_never
 
 from PIL import Image
 from pydantic import BaseModel
@@ -36,7 +36,9 @@ class ImageMetadata(BaseModel):
     detail: DetailLevel
 
     @classmethod
-    def from_resource(cls, image: Resource) -> "ImageMetadata":
+    def from_resource(
+        cls, image: Resource, detail: Optional[ImageDetail]
+    ) -> "ImageMetadata":
         with Image.open(BytesIO(image.data)) as img:
             width, height = img.size
 
@@ -44,5 +46,5 @@ def from_resource(cls, image: Resource) -> "ImageMetadata":
             image=image,
             width=width,
             height=height,
-            detail=resolve_detail_level(width, height, "auto"),
+            detail=resolve_detail_level(width, height, detail or "auto"),
         )
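
Note (not part of the patch): a minimal usage sketch of how the new "detail"
argument is meant to reach ImageMetadata.from_resource. The Resource import
path and constructor fields below are assumptions made for illustration only;
from_resource itself reads nothing but resource.data.

# Hypothetical usage sketch; Resource import path and fields are assumed.
from io import BytesIO

from PIL import Image

from aidial_adapter_openai.utils.image import ImageMetadata
from aidial_adapter_openai.utils.resource import Resource  # assumed import path

# Build a tiny in-memory PNG standing in for a downloaded image.
buf = BytesIO()
Image.new("RGB", (64, 64)).save(buf, format="PNG")
resource = Resource(type="image/png", data=buf.getvalue())  # assumed fields

# A content part may carry an optional "detail" field; the patch validates it
# against None, "auto", "low" and "high" before passing it along.
image_detail = {"image_url": {"url": "..."}, "detail": "low"}.get("detail")

# The caller-provided detail now overrides the previously hard-coded "auto";
# passing None still falls back to "auto" via `detail or "auto"`.
metadata = ImageMetadata.from_resource(resource, image_detail)
print(metadata.detail)  # resolved detail level used by the image tokenizer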