From 63c39afaf1da5c4996d5f9d338f2c26131c0ff4e Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Tue, 26 Nov 2024 13:36:53 +0200
Subject: [PATCH 01/10] Replace usage of env variables with application
 config in app state

---
 aidial_adapter_openai/app.py         | 165 ++++++++++++------
 aidial_adapter_openai/app_config.py  |  39 +++++
 aidial_adapter_openai/completions.py |   7 +-
 aidial_adapter_openai/constant.py    |   0
 .../utils/image_tokenizer.py         |  24 +--
 5 files changed, 163 insertions(+), 72 deletions(-)
 create mode 100644 aidial_adapter_openai/app_config.py
 delete mode 100644 aidial_adapter_openai/constant.py

diff --git a/aidial_adapter_openai/app.py b/aidial_adapter_openai/app.py
index 11ce937..e6cceb8 100644
--- a/aidial_adapter_openai/app.py
+++ b/aidial_adapter_openai/app.py
@@ -1,4 +1,5 @@
 from contextlib import asynccontextmanager
+from typing import Annotated
 
 import pydantic
 from aidial_sdk._errors import pydantic_validation_exception_handler
@@ -6,7 +7,7 @@
 from aidial_sdk.exceptions import InvalidRequestError
 from aidial_sdk.telemetry.init import init_telemetry
 from aidial_sdk.telemetry.types import TelemetryConfig
-from fastapi import FastAPI, Request
+from fastapi import Depends, FastAPI, Request
 from fastapi.responses import Response
 from openai import (
     APIConnectionError,
@@ -16,6 +17,7 @@
     OpenAIError,
 )
 
+from aidial_adapter_openai.app_config import ApplicationConfig
 from aidial_adapter_openai.completions import chat_completion as completion
 from aidial_adapter_openai.dalle3 import (
     chat_completion as dalle3_chat_completion,
@@ -30,17 +32,6 @@
 from aidial_adapter_openai.embeddings.openai import (
     embeddings as openai_embeddings,
 )
-from aidial_adapter_openai.env import (
-    API_VERSIONS_MAPPING,
-    AZURE_AI_VISION_DEPLOYMENTS,
-    DALLE3_AZURE_API_VERSION,
-    DALLE3_DEPLOYMENTS,
-    DATABRICKS_DEPLOYMENTS,
-    GPT4_VISION_DEPLOYMENTS,
-    MISTRAL_DEPLOYMENTS,
-    MODEL_ALIASES,
-    NON_STREAMING_DEPLOYMENTS,
-)
 from aidial_adapter_openai.gpt import gpt_chat_completion
 from aidial_adapter_openai.gpt4_multi_modal.chat_completion import (
     gpt4_vision_chat_completion,
@@ -51,7 +42,10 @@
 )
 from aidial_adapter_openai.utils.auth import get_credentials
 from aidial_adapter_openai.utils.http_client import get_http_client
-from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer
+from aidial_adapter_openai.utils.image_tokenizer import (
+    ImageTokenizer,
+    get_image_tokenizer,
+)
 from aidial_adapter_openai.utils.log_config import configure_loggers, logger
 from aidial_adapter_openai.utils.parsers import completions_parser, parse_body
 from aidial_adapter_openai.utils.streaming import create_server_response
@@ -68,16 +62,35 @@ async def lifespan(app: FastAPI):
     await get_http_client().aclose()
 
 
-app = FastAPI(lifespan=lifespan)
+def create_app(
+    app_config: ApplicationConfig | None = None,
+    to_init_telemetry: bool = True,
+    to_configure_loggers: bool = True,
+) -> FastAPI:
+    app = FastAPI(lifespan=lifespan)
+
+    if app_config is None:
+        app_config = ApplicationConfig.from_env()
+
+    app.state.app_config = app_config
+
+    if to_init_telemetry:
+        init_telemetry(app, TelemetryConfig())
+
+    if to_configure_loggers:
+        configure_loggers()
 
-init_telemetry(app, TelemetryConfig())
-configure_loggers()
+    return app
 
 
-def get_api_version(request: Request):
+def get_app_config(request: Request) -> ApplicationConfig:
+    return request.app.state.app_config
+
+
+def get_api_version(request: Request) -> str:
     api_version = request.query_params.get("api-version", "")
-    api_version = API_VERSIONS_MAPPING.get(api_version, api_version)
+    app_config = get_app_config(request)
+    api_version = app_config.API_VERSIONS_MAPPING.get(api_version, api_version)
 
     if api_version == "":
         raise InvalidRequestError("api-version is a required query parameter")
@@ -85,26 +98,52 @@ def get_api_version(request: Request):
     return api_version
 
 
+def _get_image_tokenizer(
+    deployment_id: str, app_config: ApplicationConfig
+) -> ImageTokenizer:
+    image_tokenizer = get_image_tokenizer(deployment_id, app_config)
+    if not image_tokenizer:
+        raise RuntimeError(
+            f"No image tokenizer found for deployment {deployment_id}"
+        )
+    return image_tokenizer
+
+
+app = create_app()
+
+
 @app.post("/openai/deployments/{deployment_id:path}/chat/completions")
-async def chat_completion(deployment_id: str, request: Request):
+async def chat_completion(
+    deployment_id: str,
+    request: Request,
+    app_config: Annotated[ApplicationConfig, Depends(get_app_config)],
+):
 
     data = await parse_body(request)
 
     is_stream = bool(data.get("stream"))
 
-    emulate_streaming = deployment_id in NON_STREAMING_DEPLOYMENTS and is_stream
+    emulate_streaming = (
+        deployment_id in app_config.NON_STREAMING_DEPLOYMENTS and is_stream
+    )
 
     if emulate_streaming:
         data["stream"] = False
 
     return create_server_response(
         emulate_streaming,
-        await call_chat_completion(deployment_id, data, is_stream, request),
+        await call_chat_completion(
+            deployment_id, data, is_stream, request, app_config
+        ),
     )
 
 
 async def call_chat_completion(
-    deployment_id: str, data: dict, is_stream: bool, request: Request
+    deployment_id: str,
+    data: dict,
+    is_stream: bool,
+    request: Request,
+    app_config: ApplicationConfig,
 ):
 
     # Azure OpenAI deployments ignore "model" request field,
@@ -129,9 +168,9 @@ async def call_chat_completion(
             creds,
             api_version,
             deployment_id,
+            app_config,
         )
-
-    if deployment_id in DALLE3_DEPLOYMENTS:
+    if deployment_id in app_config.DALLE3_DEPLOYMENTS:
         storage = create_file_storage("images", request.headers)
         return await dalle3_chat_completion(
             data,
@@ -139,46 +178,52 @@ async def call_chat_completion(
             creds,
             is_stream,
             storage,
-            DALLE3_AZURE_API_VERSION,
+            app_config.DALLE3_AZURE_API_VERSION,
         )
 
-    if deployment_id in MISTRAL_DEPLOYMENTS:
+    if deployment_id in app_config.MISTRAL_DEPLOYMENTS:
         return await mistral_chat_completion(data, upstream_endpoint, creds)
 
-    if deployment_id in DATABRICKS_DEPLOYMENTS:
+    if deployment_id in app_config.DATABRICKS_DEPLOYMENTS:
         return await databricks_chat_completion(data, upstream_endpoint, creds)
 
-    text_tokenizer_model = MODEL_ALIASES.get(deployment_id, deployment_id)
+    text_tokenizer_model = app_config.MODEL_ALIASES.get(
+        deployment_id, deployment_id
+    )
 
-    if image_tokenizer := get_image_tokenizer(deployment_id):
-        storage = create_file_storage("images", request.headers)
+    if deployment_id in app_config.GPT4_VISION_DEPLOYMENTS:
+        tokenizer = MultiModalTokenizer(
+            "gpt-4", _get_image_tokenizer(deployment_id, app_config)
+        )
+        return await gpt4_vision_chat_completion(
+            data,
+            deployment_id,
+            upstream_endpoint,
+            creds,
+            is_stream,
+            create_file_storage("images", request.headers),
+            api_version,
+            tokenizer,
+        )
 
-        if deployment_id in GPT4_VISION_DEPLOYMENTS:
-            tokenizer = MultiModalTokenizer("gpt-4", image_tokenizer)
-            return await gpt4_vision_chat_completion(
-                data,
-                deployment_id,
-                upstream_endpoint,
-                creds,
-                is_stream,
-                storage,
-                api_version,
-                tokenizer,
-            )
-        else:
-            tokenizer = MultiModalTokenizer(
-                text_tokenizer_model, image_tokenizer
-            )
-            return await gpt4o_chat_completion(
-                data,
-                deployment_id,
-                upstream_endpoint,
-                creds,
-                is_stream,
-                storage,
-                api_version,
-                tokenizer,
-            )
+    if deployment_id in (
+        *app_config.GPT4O_DEPLOYMENTS,
+        *app_config.GPT4O_MINI_DEPLOYMENTS,
+    ):
+        tokenizer = MultiModalTokenizer(
+            text_tokenizer_model,
+            _get_image_tokenizer(deployment_id, app_config),
+        )
+        return await gpt4o_chat_completion(
+            data,
+            deployment_id,
+            upstream_endpoint,
+            creds,
+            is_stream,
+            create_file_storage("images", request.headers),
+            api_version,
+            tokenizer,
+        )
 
     tokenizer = PlainTextTokenizer(model=text_tokenizer_model)
     return await gpt_chat_completion(
@@ -192,7 +237,11 @@ async def call_chat_completion(
 
 
 @app.post("/openai/deployments/{deployment_id:path}/embeddings")
-async def embedding(deployment_id: str, request: Request):
+async def embedding(
+    deployment_id: str,
+    request: Request,
+    app_config: Annotated[ApplicationConfig, Depends(get_app_config)],
+):
     data = await parse_body(request)
 
     # See note for /chat/completions endpoint
@@ -202,7 +251,7 @@ async def embedding(
     creds = await get_credentials(request)
     api_version = get_api_version(request)
     upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"]
 
-    if deployment_id in AZURE_AI_VISION_DEPLOYMENTS:
+    if deployment_id in app_config.AZURE_AI_VISION_DEPLOYMENTS:
         storage = create_file_storage("images", request.headers)
         return await azure_ai_vision_embeddings(
             creds, deployment_id, upstream_endpoint, storage, data
diff --git a/aidial_adapter_openai/app_config.py b/aidial_adapter_openai/app_config.py
new file mode 100644
index 0000000..45bdd94
--- /dev/null
+++ b/aidial_adapter_openai/app_config.py
@@ -0,0 +1,39 @@
+from typing import Dict, List
+
+from pydantic import BaseModel, Field
+
+import aidial_adapter_openai.env as env
+
+
+class ApplicationConfig(BaseModel):
+    MODEL_ALIASES: Dict[str, str] = Field(default_factory=dict)
+    DALLE3_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    GPT4_VISION_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    MISTRAL_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    DATABRICKS_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    GPT4O_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    GPT4O_MINI_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    AZURE_AI_VISION_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    API_VERSIONS_MAPPING: Dict[str, str] = Field(default_factory=dict)
+    COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES: Dict[str, str] = Field(
+        default_factory=dict
+    )
+    DALLE3_AZURE_API_VERSION: str = Field(default="2024-02-01")
+    NON_STREAMING_DEPLOYMENTS: List[str] = Field(default_factory=list)
+
+    @classmethod
+    def from_env(cls) -> "ApplicationConfig":
+        return cls(
+            MODEL_ALIASES=env.MODEL_ALIASES,
+            DALLE3_DEPLOYMENTS=env.DALLE3_DEPLOYMENTS,
+            GPT4_VISION_DEPLOYMENTS=env.GPT4_VISION_DEPLOYMENTS,
+            MISTRAL_DEPLOYMENTS=env.MISTRAL_DEPLOYMENTS,
+            DATABRICKS_DEPLOYMENTS=env.DATABRICKS_DEPLOYMENTS,
+            GPT4O_DEPLOYMENTS=env.GPT4O_DEPLOYMENTS,
+            GPT4O_MINI_DEPLOYMENTS=env.GPT4O_MINI_DEPLOYMENTS,
+            AZURE_AI_VISION_DEPLOYMENTS=env.AZURE_AI_VISION_DEPLOYMENTS,
+            API_VERSIONS_MAPPING=env.API_VERSIONS_MAPPING,
+            COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES=env.COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES,
+            DALLE3_AZURE_API_VERSION=env.DALLE3_AZURE_API_VERSION,
+            NON_STREAMING_DEPLOYMENTS=env.NON_STREAMING_DEPLOYMENTS,
+        )
diff --git a/aidial_adapter_openai/completions.py b/aidial_adapter_openai/completions.py
index 90834b5..4f11027 100644
--- a/aidial_adapter_openai/completions.py
+++ b/aidial_adapter_openai/completions.py
@@ -4,7 +4,7 @@
 from openai import AsyncStream
 from openai.types import Completion
 
-from aidial_adapter_openai.env import COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES
+from aidial_adapter_openai.app_config import ApplicationConfig
 from aidial_adapter_openai.utils.auth import OpenAICreds
 from aidial_adapter_openai.utils.parsers import (
     AzureOpenAIEndpoint,
@@ -46,6 +46,7 @@ async def chat_completion(
     creds: OpenAICreds,
     api_version: str,
     deployment_id: str,
+    app_config: ApplicationConfig,
 ):
 
     if data.get("n") or 1 > 1:
@@ -60,7 +61,9 @@ async def chat_completion(
     prompt = messages[-1].get("content") or ""
 
     if (
-        template := COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES.get(deployment_id)
+        template := app_config.COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES.get(
+            deployment_id
+        )
     ) is not None:
         prompt = template.format(prompt=prompt)
diff --git a/aidial_adapter_openai/constant.py b/aidial_adapter_openai/constant.py
deleted file mode 100644
index e69de29..0000000
diff --git a/aidial_adapter_openai/utils/image_tokenizer.py b/aidial_adapter_openai/utils/image_tokenizer.py
index 0fe5bf9..b6d328d 100644
--- a/aidial_adapter_openai/utils/image_tokenizer.py
+++ b/aidial_adapter_openai/utils/image_tokenizer.py
@@ -8,11 +8,7 @@
 
 from pydantic import BaseModel
 
-from aidial_adapter_openai.env import (
-    GPT4_VISION_DEPLOYMENTS,
-    GPT4O_DEPLOYMENTS,
-    GPT4O_MINI_DEPLOYMENTS,
-)
+from aidial_adapter_openai.app_config import ApplicationConfig
 from aidial_adapter_openai.utils.image import ImageDetail, resolve_detail_level
 
 
@@ -58,14 +54,18 @@ def _compute_high_detail_tokens(self, width: int, height: int) -> int:
     low_detail_tokens=2833, tokens_per_tile=5667
 )
 
-_TOKENIZERS: List[Tuple[ImageTokenizer, List[str]]] = [
-    (GPT4O_IMAGE_TOKENIZER, GPT4O_DEPLOYMENTS),
-    (GPT4O_MINI_IMAGE_TOKENIZER, GPT4O_MINI_DEPLOYMENTS),
-    (GPT4_VISION_IMAGE_TOKENIZER, GPT4_VISION_DEPLOYMENTS),
-]
-
 
-def get_image_tokenizer(deployment_id: str) -> ImageTokenizer | None:
+def get_image_tokenizer(
+    deployment_id: str, app_config: ApplicationConfig
+) -> ImageTokenizer | None:
+    _TOKENIZERS: List[Tuple[ImageTokenizer, List[str]]] = [
+        (GPT4O_IMAGE_TOKENIZER, app_config.GPT4O_DEPLOYMENTS),
+        (GPT4O_MINI_IMAGE_TOKENIZER, app_config.GPT4O_MINI_DEPLOYMENTS),
+        (
+            GPT4_VISION_IMAGE_TOKENIZER,
+            app_config.GPT4_VISION_DEPLOYMENTS,
+        ),
+    ]
     for tokenizer, ids in _TOKENIZERS:
         if deployment_id in ids:
             return tokenizer

From 6fd770ed6328d9a47dc29eebc1ce81221712a66d Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Wed, 27 Nov 2024 10:30:07 +0200
Subject: [PATCH 02/10] Big refactoring, address comments in PR

---
 aidial_adapter_openai/app.py                | 275 +++----------
 aidial_adapter_openai/app_config.py         |  95 ++++--
 aidial_adapter_openai/env.py                |  47 ---
 aidial_adapter_openai/gpt.py                |   2 +
 .../gpt4_multi_modal/chat_completion.py     |   6 +
 aidial_adapter_openai/routers/__init__.py   |   0
 .../routers/chat_completion.py              | 159 ++++++++
 aidial_adapter_openai/routers/embeddings.py |  41 +++
 .../utils/image_tokenizer.py                |  26 +-
 aidial_adapter_openai/utils/request.py      |  27 ++
 aidial_adapter_openai/utils/streaming.py    |   6 +-
 tests/conftest.py                           |  26 +-
 12 files changed, 366 insertions(+), 344 deletions(-)
 delete mode 100644 aidial_adapter_openai/env.py
 create mode 100644 aidial_adapter_openai/routers/__init__.py
 create mode 100644 aidial_adapter_openai/routers/chat_completion.py
 create mode 100644 aidial_adapter_openai/routers/embeddings.py
 create mode 100644 aidial_adapter_openai/utils/request.py

diff --git a/aidial_adapter_openai/app.py b/aidial_adapter_openai/app.py
index e6cceb8..cd348c8 100644
--- a/aidial_adapter_openai/app.py
+++ b/aidial_adapter_openai/app.py
@@ -1,13 +1,11 @@
 from contextlib import asynccontextmanager
-from typing import Annotated
 
 import pydantic
 from aidial_sdk._errors import pydantic_validation_exception_handler
 from aidial_sdk.exceptions import HTTPException as DialException
-from aidial_sdk.exceptions import InvalidRequestError
 from aidial_sdk.telemetry.init import init_telemetry
 from aidial_sdk.telemetry.types import TelemetryConfig
-from fastapi import Depends, FastAPI, Request
+from fastapi import FastAPI, Request
 from fastapi.responses import Response
 from openai import (
     APIConnectionError,
@@ -18,41 +16,10 @@
 )
 
 from aidial_adapter_openai.app_config import ApplicationConfig
-from aidial_adapter_openai.completions import chat_completion as completion
-from aidial_adapter_openai.dalle3 import (
-    chat_completion as dalle3_chat_completion,
-)
-from aidial_adapter_openai.databricks import (
-    chat_completion as databricks_chat_completion,
-)
-from aidial_adapter_openai.dial_api.storage import create_file_storage
-from aidial_adapter_openai.embeddings.azure_ai_vision import (
-    embeddings as azure_ai_vision_embeddings,
-)
-from aidial_adapter_openai.embeddings.openai import (
-    embeddings as openai_embeddings,
-)
-from aidial_adapter_openai.gpt import gpt_chat_completion
-from aidial_adapter_openai.gpt4_multi_modal.chat_completion import (
-    gpt4_vision_chat_completion,
-    gpt4o_chat_completion,
-)
-from aidial_adapter_openai.mistral import (
-    chat_completion as mistral_chat_completion,
-)
-from aidial_adapter_openai.utils.auth import get_credentials
+from aidial_adapter_openai.routers.chat_completion import chat_completion
+from aidial_adapter_openai.routers.embeddings import embedding
 from aidial_adapter_openai.utils.http_client import get_http_client
-from aidial_adapter_openai.utils.image_tokenizer import (
-    ImageTokenizer,
-    get_image_tokenizer,
-)
 from aidial_adapter_openai.utils.log_config import configure_loggers, logger
-from aidial_adapter_openai.utils.parsers import completions_parser, parse_body
-from aidial_adapter_openai.utils.streaming import create_server_response
-from aidial_adapter_openai.utils.tokenizer import (
-    MultiModalTokenizer,
-    PlainTextTokenizer,
-)
 
 
 @asynccontextmanager
@@ -62,205 +29,6 @@ async def lifespan(app: FastAPI):
     await get_http_client().aclose()
 
 
-def create_app(
-    app_config: ApplicationConfig | None = None,
-    to_init_telemetry: bool = True,
-    to_configure_loggers: bool = True,
-) -> FastAPI:
-    app = FastAPI(lifespan=lifespan)
-
-    if app_config is None:
-        app_config = ApplicationConfig.from_env()
-
-    app.state.app_config = app_config
-
-    if to_init_telemetry:
-        init_telemetry(app, TelemetryConfig())
-
-    if to_configure_loggers:
-        configure_loggers()
-
-    return app
-
-
-def get_app_config(request: Request) -> ApplicationConfig:
-    return request.app.state.app_config
-
-
-def get_api_version(request: Request) -> str:
-    api_version = request.query_params.get("api-version", "")
-    app_config = get_app_config(request)
-    api_version = app_config.API_VERSIONS_MAPPING.get(api_version, api_version)
-
-    if api_version == "":
-        raise InvalidRequestError("api-version is a required query parameter")
-
-    return api_version
-
-
-def _get_image_tokenizer(
-    deployment_id: str, app_config: ApplicationConfig
-) -> ImageTokenizer:
-    image_tokenizer = get_image_tokenizer(deployment_id, app_config)
-    if not image_tokenizer:
-        raise RuntimeError(
-            f"No image tokenizer found for deployment {deployment_id}"
-        )
-    return image_tokenizer
-
-
-app = create_app()
-
-
-@app.post("/openai/deployments/{deployment_id:path}/chat/completions")
-async def chat_completion(
-    deployment_id: str,
-    request: Request,
-    app_config: Annotated[ApplicationConfig, Depends(get_app_config)],
-):
-
-    data = await parse_body(request)
-
-    is_stream = bool(data.get("stream"))
-
-    emulate_streaming = (
-        deployment_id in app_config.NON_STREAMING_DEPLOYMENTS and is_stream
-    )
-
-    if emulate_streaming:
-        data["stream"] = False
-
-    return create_server_response(
-        emulate_streaming,
-        await call_chat_completion(
-            deployment_id, data, is_stream, request, app_config
-        ),
-    )
-
-
-async def call_chat_completion(
-    deployment_id: str,
-    data: dict,
-    is_stream: bool,
-    request: Request,
-    app_config: ApplicationConfig,
-):
-
-    # Azure OpenAI deployments ignore "model" request field,
-    # since the deployment id is already encoded in the endpoint path.
-    # This is not the case for non-Azure OpenAI deployments, so
-    # they require the "model" field to be set.
-    # However, openai==1.33.0 requires the "model" field for **both**
-    # Azure and non-Azure deployments.
-    # Therefore, we provide the "model" field for all deployments here.
-    # The same goes for /embeddings endpoint.
-    data["model"] = deployment_id
-
-    creds = await get_credentials(request)
-    api_version = get_api_version(request)
-
-    upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"]
-
-    if completions_endpoint := completions_parser.parse(upstream_endpoint):
-        return await completion(
-            data,
-            completions_endpoint,
-            creds,
-            api_version,
-            deployment_id,
-            app_config,
-        )
-    if deployment_id in app_config.DALLE3_DEPLOYMENTS:
-        storage = create_file_storage("images", request.headers)
-        return await dalle3_chat_completion(
-            data,
-            upstream_endpoint,
-            creds,
-            is_stream,
-            storage,
-            app_config.DALLE3_AZURE_API_VERSION,
-        )
-
-    if deployment_id in app_config.MISTRAL_DEPLOYMENTS:
-        return await mistral_chat_completion(data, upstream_endpoint, creds)
-
-    if deployment_id in app_config.DATABRICKS_DEPLOYMENTS:
-        return await databricks_chat_completion(data, upstream_endpoint, creds)
-
-    text_tokenizer_model = app_config.MODEL_ALIASES.get(
-        deployment_id, deployment_id
-    )
-
-    if deployment_id in app_config.GPT4_VISION_DEPLOYMENTS:
-        tokenizer = MultiModalTokenizer(
-            "gpt-4", _get_image_tokenizer(deployment_id, app_config)
-        )
-        return await gpt4_vision_chat_completion(
-            data,
-            deployment_id,
-            upstream_endpoint,
-            creds,
-            is_stream,
-            create_file_storage("images", request.headers),
-            api_version,
-            tokenizer,
-        )
-
-    if deployment_id in (
-        *app_config.GPT4O_DEPLOYMENTS,
-        *app_config.GPT4O_MINI_DEPLOYMENTS,
-    ):
-        tokenizer = MultiModalTokenizer(
-            text_tokenizer_model,
-            _get_image_tokenizer(deployment_id, app_config),
-        )
-        return await gpt4o_chat_completion(
-            data,
-            deployment_id,
-            upstream_endpoint,
-            creds,
-            is_stream,
-            create_file_storage("images", request.headers),
-            api_version,
-            tokenizer,
-        )
-
-    tokenizer = PlainTextTokenizer(model=text_tokenizer_model)
-    return await gpt_chat_completion(
-        data,
-        deployment_id,
-        upstream_endpoint,
-        creds,
-        api_version,
-        tokenizer,
-    )
-
-
-@app.post("/openai/deployments/{deployment_id:path}/embeddings")
-async def embedding(
-    deployment_id: str,
-    request: Request,
-    app_config: Annotated[ApplicationConfig, Depends(get_app_config)],
-):
-    data = await parse_body(request)
-
-    # See note for /chat/completions endpoint
-    data["model"] = deployment_id
-
-    creds = await get_credentials(request)
-    api_version = get_api_version(request)
-    upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"]
-
-    if deployment_id in app_config.AZURE_AI_VISION_DEPLOYMENTS:
-        storage = create_file_storage("images", request.headers)
-        return await azure_ai_vision_embeddings(
-            creds, deployment_id, upstream_endpoint, storage, data
-        )
-
-    return await openai_embeddings(creds, upstream_endpoint, api_version, data)
-
-
-@app.exception_handler(OpenAIError)
 def openai_exception_handler(request: Request, e: DialException):
@@ -303,16 +71,43 @@ def openai_exception_handler(request: Request, e: DialException):
     )
 
 
-@app.exception_handler(pydantic.ValidationError)
 def pydantic_exception_handler(request: Request, exc: pydantic.ValidationError):
     return pydantic_validation_exception_handler(request, exc)
 
 
-@app.exception_handler(DialException)
 def dial_exception_handler(request: Request, exc: DialException):
     return exc.to_fastapi_response()
 
 
-@app.get("/health")
-def health():
-    return {"status": "ok"}
+def create_app(
+    app_config: ApplicationConfig | None = None,
+    to_init_telemetry: bool = True,
+) -> FastAPI:
+    app = FastAPI(lifespan=lifespan)
+
+    if app_config is None:
+        app_config = ApplicationConfig.from_env()
+
+    app.state.app_config = app_config
+
+    if to_init_telemetry:
+        init_telemetry(app, TelemetryConfig())
+
+    configure_loggers()
+
+    @app.get("/health")
+    def health():
+        return {"status": "ok"}
+
+    app.post("/openai/deployments/{deployment_id:path}/embeddings")(embedding)
+    app.post("/openai/deployments/{deployment_id:path}/chat/completions")(
+        chat_completion
+    )
+    app.exception_handler(OpenAIError)(openai_exception_handler)
+    app.exception_handler(pydantic.ValidationError)(pydantic_exception_handler)
+    app.exception_handler(DialException)(dial_exception_handler)
+
+    return app
+
+
+app = create_app()
diff --git a/aidial_adapter_openai/app_config.py b/aidial_adapter_openai/app_config.py
index 45bdd94..d024e4c 100644
--- a/aidial_adapter_openai/app_config.py
+++ b/aidial_adapter_openai/app_config.py
@@ -1,39 +1,78 @@
+import json
+import os
 from typing import Dict, List
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 
-import aidial_adapter_openai.env as env
+from aidial_adapter_openai.utils.env import get_env_bool
+from aidial_adapter_openai.utils.log_config import logger
+from aidial_adapter_openai.utils.parsers import parse_deployment_list
+
+
+def _get_eliminate_empty_choices() -> bool:
+    old_name = "FIX_STREAMING_ISSUES_IN_NEW_API_VERSIONS"
+    new_name = "ELIMINATE_EMPTY_CHOICES"
+
+    if old_name in os.environ:
+        logger.warning(
+            f"{old_name} environment variable is deprecated. Use {new_name} instead."
+        )
+        return get_env_bool(old_name, False)
+
+    return get_env_bool(new_name, False)
 
 
 class ApplicationConfig(BaseModel):
-    MODEL_ALIASES: Dict[str, str] = Field(default_factory=dict)
-    DALLE3_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    GPT4_VISION_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    MISTRAL_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    DATABRICKS_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    GPT4O_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    GPT4O_MINI_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    AZURE_AI_VISION_DEPLOYMENTS: List[str] = Field(default_factory=list)
-    API_VERSIONS_MAPPING: Dict[str, str] = Field(default_factory=dict)
-    COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES: Dict[str, str] = Field(
-        default_factory=dict
-    )
-    DALLE3_AZURE_API_VERSION: str = Field(default="2024-02-01")
-    NON_STREAMING_DEPLOYMENTS: List[str] = Field(default_factory=list)
+    MODEL_ALIASES: Dict[str, str] = {}
+    DALLE3_DEPLOYMENTS: List[str] = []
+    GPT4_VISION_DEPLOYMENTS: List[str] = []
+    MISTRAL_DEPLOYMENTS: List[str] = []
+    DATABRICKS_DEPLOYMENTS: List[str] = []
+    GPT4O_DEPLOYMENTS: List[str] = []
+    GPT4O_MINI_DEPLOYMENTS: List[str] = []
+    AZURE_AI_VISION_DEPLOYMENTS: List[str] = []
+    API_VERSIONS_MAPPING: Dict[str, str] = {}
+    COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES: Dict[str, str] = {}
+    DALLE3_AZURE_API_VERSION: str = "2024-02-01"
+    NON_STREAMING_DEPLOYMENTS: List[str] = []
+    ELIMINATE_EMPTY_CHOICES: bool = False
 
     @classmethod
     def from_env(cls) -> "ApplicationConfig":
         return cls(
-            MODEL_ALIASES=env.MODEL_ALIASES,
-            DALLE3_DEPLOYMENTS=env.DALLE3_DEPLOYMENTS,
-            GPT4_VISION_DEPLOYMENTS=env.GPT4_VISION_DEPLOYMENTS,
-            MISTRAL_DEPLOYMENTS=env.MISTRAL_DEPLOYMENTS,
-            DATABRICKS_DEPLOYMENTS=env.DATABRICKS_DEPLOYMENTS,
-            GPT4O_DEPLOYMENTS=env.GPT4O_DEPLOYMENTS,
-            GPT4O_MINI_DEPLOYMENTS=env.GPT4O_MINI_DEPLOYMENTS,
-            AZURE_AI_VISION_DEPLOYMENTS=env.AZURE_AI_VISION_DEPLOYMENTS,
-            API_VERSIONS_MAPPING=env.API_VERSIONS_MAPPING,
-            COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES=env.COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES,
-            DALLE3_AZURE_API_VERSION=env.DALLE3_AZURE_API_VERSION,
-            NON_STREAMING_DEPLOYMENTS=env.NON_STREAMING_DEPLOYMENTS,
+            MODEL_ALIASES=json.loads(os.getenv("MODEL_ALIASES", "{}")),
+            DALLE3_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("DALLE3_DEPLOYMENTS")
+            ),
+            GPT4_VISION_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("GPT4_VISION_DEPLOYMENTS")
+            ),
+            MISTRAL_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("MISTRAL_DEPLOYMENTS")
+            ),
+            DATABRICKS_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("DATABRICKS_DEPLOYMENTS")
+            ),
+            GPT4O_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("GPT4O_DEPLOYMENTS")
+            ),
+            GPT4O_MINI_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("GPT4O_MINI_DEPLOYMENTS")
+            ),
+            AZURE_AI_VISION_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("AZURE_AI_VISION_DEPLOYMENTS")
+            ),
+            API_VERSIONS_MAPPING=json.loads(
+                os.getenv("API_VERSIONS_MAPPING", "{}")
+            ),
+            COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES=json.loads(
+                os.getenv("COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES") or "{}"
+            ),
+            DALLE3_AZURE_API_VERSION=os.getenv(
+                "DALLE3_AZURE_API_VERSION", "2024-02-01"
+            ),
+            NON_STREAMING_DEPLOYMENTS=parse_deployment_list(
+                os.getenv("NON_STREAMING_DEPLOYMENTS")
+            ),
+            ELIMINATE_EMPTY_CHOICES=_get_eliminate_empty_choices(),
         )
diff --git a/aidial_adapter_openai/env.py b/aidial_adapter_openai/env.py
deleted file mode 100644
index e55ddea..0000000
--- a/aidial_adapter_openai/env.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import json
-import os
-from typing import Dict
-
-from aidial_adapter_openai.utils.env import get_env_bool
-from aidial_adapter_openai.utils.log_config import logger
-from aidial_adapter_openai.utils.parsers import parse_deployment_list
-
-MODEL_ALIASES: Dict[str, str] = json.loads(os.getenv("MODEL_ALIASES", "{}"))
-DALLE3_DEPLOYMENTS = parse_deployment_list(os.getenv("DALLE3_DEPLOYMENTS"))
-GPT4_VISION_DEPLOYMENTS = parse_deployment_list(
-    os.getenv("GPT4_VISION_DEPLOYMENTS")
-)
-MISTRAL_DEPLOYMENTS = parse_deployment_list(os.getenv("MISTRAL_DEPLOYMENTS"))
-DATABRICKS_DEPLOYMENTS = parse_deployment_list(
-    os.getenv("DATABRICKS_DEPLOYMENTS")
-)
-GPT4O_DEPLOYMENTS = parse_deployment_list(os.getenv("GPT4O_DEPLOYMENTS"))
-GPT4O_MINI_DEPLOYMENTS = parse_deployment_list(
-    os.getenv("GPT4O_MINI_DEPLOYMENTS")
-)
-API_VERSIONS_MAPPING: Dict[str, str] = json.loads(
-    os.getenv("API_VERSIONS_MAPPING", "{}")
-)
-COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES: Dict[str, str] = json.loads(
-    os.getenv("COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES") or "{}"
-)
-DALLE3_AZURE_API_VERSION = os.getenv("DALLE3_AZURE_API_VERSION", "2024-02-01")
-NON_STREAMING_DEPLOYMENTS = parse_deployment_list(
-    os.getenv("NON_STREAMING_DEPLOYMENTS")
-)
-AZURE_AI_VISION_DEPLOYMENTS = parse_deployment_list(
-    os.getenv("AZURE_AI_VISION_DEPLOYMENTS")
-)
-
-
-def get_eliminate_empty_choices() -> bool:
-    old_name = "FIX_STREAMING_ISSUES_IN_NEW_API_VERSIONS"
-    new_name = "ELIMINATE_EMPTY_CHOICES"
-
-    if old_name in os.environ:
-        logger.warning(
-            f"{old_name} environment variable is deprecated. Use {new_name} instead."
-        )
-        return get_env_bool(old_name, False)
-
-    return get_env_bool(new_name, False)
diff --git a/aidial_adapter_openai/gpt.py b/aidial_adapter_openai/gpt.py
index 5d6610d..d4c6cde 100644
--- a/aidial_adapter_openai/gpt.py
+++ b/aidial_adapter_openai/gpt.py
@@ -44,6 +44,7 @@ async def gpt_chat_completion(
     creds: OpenAICreds,
     api_version: str,
     tokenizer: PlainTextTokenizer,
+    eliminate_empty_choices: bool,
 ):
     discarded_messages = None
     estimated_prompt_tokens = None
@@ -83,6 +84,7 @@ async def gpt_chat_completion(
             deployment=deployment_id,
             discarded_messages=discarded_messages,
             stream=map_stream(chunk_to_dict, response),
+            eliminate_empty_choices=eliminate_empty_choices,
         )
     else:
         rest = response.to_dict()
diff --git a/aidial_adapter_openai/gpt4_multi_modal/chat_completion.py b/aidial_adapter_openai/gpt4_multi_modal/chat_completion.py
index 26d82c7..216137d 100644
--- a/aidial_adapter_openai/gpt4_multi_modal/chat_completion.py
+++ b/aidial_adapter_openai/gpt4_multi_modal/chat_completion.py
@@ -143,6 +143,7 @@ async def gpt4o_chat_completion(
     file_storage: Optional[FileStorage],
     api_version: str,
     tokenizer: MultiModalTokenizer,
+    eliminate_empty_choices: bool,
 ):
     return await chat_completion(
         request,
@@ -155,6 +156,7 @@ async def gpt4o_chat_completion(
         tokenizer,
         lambda x: x,
         None,
+        eliminate_empty_choices,
     )
 
 
@@ -167,6 +169,7 @@ async def gpt4_vision_chat_completion(
     file_storage: Optional[FileStorage],
     api_version: str,
     tokenizer: MultiModalTokenizer,
+    eliminate_empty_choices: bool,
 ):
     return await chat_completion(
         request,
@@ -179,6 +182,7 @@ async def gpt4_vision_chat_completion(
         tokenizer,
         convert_gpt4v_to_gpt4_chunk,
         GPT4V_DEFAULT_MAX_TOKENS,
+        eliminate_empty_choices,
     )
 
 
@@ -193,6 +197,7 @@ async def chat_completion(
     tokenizer: MultiModalTokenizer,
     response_transformer: Callable[[dict], dict | None],
     default_max_tokens: Optional[int],
+    eliminate_empty_choices: bool,
 ):
     if request.get("n", 1) > 1:
         raise RequestValidationError("The deployment doesn't support n > 1")
@@ -265,6 +270,7 @@ def debug_print(chunk: T) -> T:
                     response_transformer,
                     parse_openai_sse_stream(response),
                 ),
+                eliminate_empty_choices=eliminate_empty_choices,
             ),
         )
     else:
diff --git a/aidial_adapter_openai/routers/__init__.py b/aidial_adapter_openai/routers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/aidial_adapter_openai/routers/chat_completion.py b/aidial_adapter_openai/routers/chat_completion.py
new file mode 100644
index 0000000..d6f86ac
--- /dev/null
+++ b/aidial_adapter_openai/routers/chat_completion.py
@@ -0,0 +1,159 @@
+from typing import Annotated
+
+from fastapi import Depends, Request
+
+from aidial_adapter_openai.app_config import ApplicationConfig
+from aidial_adapter_openai.completions import chat_completion as completion
+from aidial_adapter_openai.dalle3 import (
+    chat_completion as dalle3_chat_completion,
+)
+from aidial_adapter_openai.databricks import (
+    chat_completion as databricks_chat_completion,
+)
+from aidial_adapter_openai.dial_api.storage import create_file_storage
+from aidial_adapter_openai.gpt import gpt_chat_completion
+from aidial_adapter_openai.gpt4_multi_modal.chat_completion import (
+    gpt4_vision_chat_completion,
+    gpt4o_chat_completion,
+)
+from aidial_adapter_openai.mistral import (
+    chat_completion as mistral_chat_completion,
+)
+from aidial_adapter_openai.utils.auth import get_credentials
+from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer
+from aidial_adapter_openai.utils.parsers import completions_parser, parse_body
+from aidial_adapter_openai.utils.request import (
+    get_api_version,
+    get_request_app_config,
+)
+from aidial_adapter_openai.utils.streaming import create_server_response
+from aidial_adapter_openai.utils.tokenizer import (
+    MultiModalTokenizer,
+    PlainTextTokenizer,
+)
+
+
+async def call_chat_completion(
+    deployment_id: str,
+    data: dict,
+    is_stream: bool,
+    request: Request,
+    app_config: ApplicationConfig,
+):
+
+    # Azure OpenAI deployments ignore "model" request field,
+    # since the deployment id is already encoded in the endpoint path.
+    # This is not the case for non-Azure OpenAI deployments, so
+    # they require the "model" field to be set.
+    # However, openai==1.33.0 requires the "model" field for **both**
+    # Azure and non-Azure deployments.
+    # Therefore, we provide the "model" field for all deployments here.
+    # The same goes for /embeddings endpoint.
+    data["model"] = deployment_id
+
+    creds = await get_credentials(request)
+    api_version = get_api_version(request)
+
+    upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"]
+
+    if completions_endpoint := completions_parser.parse(upstream_endpoint):
+        return await completion(
+            data,
+            completions_endpoint,
+            creds,
+            api_version,
+            deployment_id,
+            app_config,
+        )
+    if deployment_id in app_config.DALLE3_DEPLOYMENTS:
+        storage = create_file_storage("images", request.headers)
+        return await dalle3_chat_completion(
+            data,
+            upstream_endpoint,
+            creds,
+            is_stream,
+            storage,
+            app_config.DALLE3_AZURE_API_VERSION,
+        )
+
+    if deployment_id in app_config.MISTRAL_DEPLOYMENTS:
+        return await mistral_chat_completion(data, upstream_endpoint, creds)
+
+    if deployment_id in app_config.DATABRICKS_DEPLOYMENTS:
+        return await databricks_chat_completion(data, upstream_endpoint, creds)
+
+    text_tokenizer_model = app_config.MODEL_ALIASES.get(
+        deployment_id, deployment_id
+    )
+
+    if deployment_id in app_config.GPT4_VISION_DEPLOYMENTS:
+        tokenizer = MultiModalTokenizer(
+            "gpt-4", get_image_tokenizer(deployment_id, app_config)
+        )
+        return await gpt4_vision_chat_completion(
+            data,
+            deployment_id,
+            upstream_endpoint,
+            creds,
+            is_stream,
+            create_file_storage("images", request.headers),
+            api_version,
+            tokenizer,
+            app_config.ELIMINATE_EMPTY_CHOICES,
+        )
+
+    if deployment_id in (
+        *app_config.GPT4O_DEPLOYMENTS,
+        *app_config.GPT4O_MINI_DEPLOYMENTS,
+    ):
+        tokenizer = MultiModalTokenizer(
+            text_tokenizer_model,
+            get_image_tokenizer(deployment_id, app_config),
+        )
+        return await gpt4o_chat_completion(
+            data,
+            deployment_id,
+            upstream_endpoint,
+            creds,
+            is_stream,
+            create_file_storage("images", request.headers),
+            api_version,
+            tokenizer,
+            app_config.ELIMINATE_EMPTY_CHOICES,
+        )
+
+    tokenizer = PlainTextTokenizer(model=text_tokenizer_model)
+    return await gpt_chat_completion(
+        data,
+        deployment_id,
+        upstream_endpoint,
+        creds,
+        api_version,
+        tokenizer,
+        app_config.ELIMINATE_EMPTY_CHOICES,
+    )
+
+
+async def chat_completion(
+    deployment_id: str,
+    request: Request,
+    app_config: Annotated[ApplicationConfig, Depends(get_request_app_config)],
+):
+
+    data = await parse_body(request)
+
+    is_stream = bool(data.get("stream"))
+
+    emulate_streaming = (
+        deployment_id in app_config.NON_STREAMING_DEPLOYMENTS and is_stream
+    )
+
+    if emulate_streaming:
+        data["stream"] = False
+
+    return create_server_response(
+        emulate_streaming,
+        await call_chat_completion(
+            deployment_id, data, is_stream, request, app_config
+        ),
+    )
diff --git a/aidial_adapter_openai/routers/embeddings.py b/aidial_adapter_openai/routers/embeddings.py
new file mode 100644
index 0000000..126d3ff
--- /dev/null
+++ b/aidial_adapter_openai/routers/embeddings.py
@@ -0,0 +1,41 @@
+from typing import Annotated
+
+from fastapi import Depends, Request
+
+from aidial_adapter_openai.app_config import ApplicationConfig
+from aidial_adapter_openai.dial_api.storage import create_file_storage
+from aidial_adapter_openai.embeddings.azure_ai_vision import (
+    embeddings as azure_ai_vision_embeddings,
+)
+from aidial_adapter_openai.embeddings.openai import (
+    embeddings as openai_embeddings,
+)
+from aidial_adapter_openai.utils.auth import get_credentials
+from aidial_adapter_openai.utils.parsers import parse_body
+from aidial_adapter_openai.utils.request import (
+    get_api_version,
+    get_request_app_config,
+)
+
+
+async def embedding(
+    deployment_id: str,
+    request: Request,
+    app_config: Annotated[ApplicationConfig, Depends(get_request_app_config)],
+):
+    data = await parse_body(request)
+
+    # See note for /chat/completions endpoint
+    data["model"] = deployment_id
+
+    creds = await get_credentials(request)
+    api_version = get_api_version(request)
+    upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"]
+
+    if deployment_id in app_config.AZURE_AI_VISION_DEPLOYMENTS:
+        storage = create_file_storage("images", request.headers)
+        return await azure_ai_vision_embeddings(
+            creds, deployment_id, upstream_endpoint, storage, data
+        )
+
+    return await openai_embeddings(creds, upstream_endpoint, api_version, data)
diff --git a/aidial_adapter_openai/utils/image_tokenizer.py b/aidial_adapter_openai/utils/image_tokenizer.py
index b6d328d..01eb467 100644
--- a/aidial_adapter_openai/utils/image_tokenizer.py
+++ b/aidial_adapter_openai/utils/image_tokenizer.py
@@ -4,7 +4,7 @@
 """
 
 import math
-from typing import List, Tuple, assert_never
+from typing import assert_never
 
 from pydantic import BaseModel
 
@@ -57,19 +57,17 @@ def _compute_high_detail_tokens(self, width: int, height: int) -> int:
 
 def get_image_tokenizer(
     deployment_id: str, app_config: ApplicationConfig
-) -> ImageTokenizer | None:
-    _TOKENIZERS: List[Tuple[ImageTokenizer, List[str]]] = [
-        (GPT4O_IMAGE_TOKENIZER, app_config.GPT4O_DEPLOYMENTS),
-        (GPT4O_MINI_IMAGE_TOKENIZER, app_config.GPT4O_MINI_DEPLOYMENTS),
-        (
-            GPT4_VISION_IMAGE_TOKENIZER,
-            app_config.GPT4_VISION_DEPLOYMENTS,
-        ),
-    ]
-    for tokenizer, ids in _TOKENIZERS:
-        if deployment_id in ids:
-            return tokenizer
-    return None
+) -> ImageTokenizer:
+    if deployment_id in app_config.GPT4O_DEPLOYMENTS:
+        return GPT4O_IMAGE_TOKENIZER
+    elif deployment_id in app_config.GPT4O_MINI_DEPLOYMENTS:
+        return GPT4O_MINI_IMAGE_TOKENIZER
+    elif deployment_id in app_config.GPT4_VISION_DEPLOYMENTS:
+        return GPT4_VISION_IMAGE_TOKENIZER
+    else:
+        raise RuntimeError(
+            f"No image tokenizer found for deployment {deployment_id}"
+        )
 
 
 def _fit_longest(width: int, height: int, size: int) -> tuple[int, int]:
diff --git a/aidial_adapter_openai/utils/request.py b/aidial_adapter_openai/utils/request.py
new file mode 100644
index 0000000..c8d6457
--- /dev/null
+++ b/aidial_adapter_openai/utils/request.py
@@ -0,0 +1,27 @@
+from aidial_sdk.exceptions import InvalidRequestError
+from fastapi import FastAPI, Request
+
+from aidial_adapter_openai.app_config import ApplicationConfig
+
+
+def set_app_config(app: FastAPI, app_config: ApplicationConfig):
+    app.state.app_config = app_config
+
+
+def get_app_config(app: FastAPI) -> ApplicationConfig:
+    return app.state.app_config
+
+
+def get_request_app_config(request: Request) -> ApplicationConfig:
+    return get_app_config(request.app)
+
+
+def get_api_version(request: Request) -> str:
+    api_version = request.query_params.get("api-version", "")
+    app_config = get_request_app_config(request)
+    api_version = app_config.API_VERSIONS_MAPPING.get(api_version, api_version)
+
+    if api_version == "":
+        raise InvalidRequestError("api-version is a required query parameter")
+
+    return api_version
diff --git a/aidial_adapter_openai/utils/streaming.py b/aidial_adapter_openai/utils/streaming.py
index d677ef7..724ae00 100644
--- a/aidial_adapter_openai/utils/streaming.py
+++ b/aidial_adapter_openai/utils/streaming.py
@@ -10,7 +10,6 @@
 from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
 from pydantic import BaseModel
 
-from aidial_adapter_openai.env import get_eliminate_empty_choices
 from aidial_adapter_openai.utils.chat_completion_response import (
     ChatCompletionResponse,
     ChatCompletionStreamingChunk,
@@ -18,8 +17,6 @@
 from aidial_adapter_openai.utils.log_config import logger
 from aidial_adapter_openai.utils.sse_stream import to_openai_sse_stream
 
-ELIMINATE_EMPTY_CHOICES = get_eliminate_empty_choices()
-
 
 def generate_id() -> str:
     return "chatcmpl-" + str(uuid4())
@@ -62,6 +59,7 @@ async def generate_stream(
     deployment: str,
     discarded_messages: Optional[list[int]],
     stream: AsyncIterator[dict],
+    eliminate_empty_choices: bool,
 ) -> AsyncIterator[dict]:
 
     empty_chunk = build_chunk(
@@ -116,7 +114,7 @@ def set_discarded_messages(chunk: dict | None, indices: list[int]) -> dict:
         # when content filtering is enabled for a corresponding deployment.
         # The safety rating of the request is reported in this first chunk.
         # Here we withhold such a chunk and merge it later with a follow-up chunk.
-        if len(choices) == 0 and ELIMINATE_EMPTY_CHOICES:
+        if len(choices) == 0 and eliminate_empty_choices:
             buffer_chunk = chunk
         else:
             if last_chunk is not None:
diff --git a/tests/conftest.py b/tests/conftest.py
index 811cab1..a8efc95 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,25 +1,29 @@
-from unittest.mock import patch
-
 import httpx
 import pytest
 import pytest_asyncio
 from httpx import ASGITransport
 
-from aidial_adapter_openai.app import app
+from aidial_adapter_openai.app import create_app
+from aidial_adapter_openai.utils.request import get_app_config
 
 
-@pytest.fixture
-def eliminate_empty_choices():
-    with patch(
-        "aidial_adapter_openai.utils.streaming.ELIMINATE_EMPTY_CHOICES", True
-    ):
-        yield
+@pytest_asyncio.fixture
+def _app_instance():
+    return create_app()
 
 
 @pytest_asyncio.fixture
-async def test_app():
+async def test_app(_app_instance):
     async with httpx.AsyncClient(
-        transport=ASGITransport(app=app),  # type: ignore
+        transport=ASGITransport(app=_app_instance),
         base_url="http://test-app.com",
     ) as client:
         yield client
+
+
+@pytest.fixture
+def eliminate_empty_choices(_app_instance):
+    app_config = get_app_config(_app_instance)
+    app_config.ELIMINATE_EMPTY_CHOICES = True
+    yield
+    app_config.ELIMINATE_EMPTY_CHOICES = False

From 10aba8ce575b64735760c799f6824c84681f92c5 Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Wed, 27 Nov 2024 10:32:22 +0200
Subject: [PATCH 03/10] Refactor

---
 aidial_adapter_openai/app.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/aidial_adapter_openai/app.py b/aidial_adapter_openai/app.py
index cd348c8..1a264ec 100644
--- a/aidial_adapter_openai/app.py
+++ b/aidial_adapter_openai/app.py
@@ -20,6 +20,7 @@
 from aidial_adapter_openai.routers.embeddings import embedding
 from aidial_adapter_openai.utils.http_client import get_http_client
 from aidial_adapter_openai.utils.log_config import configure_loggers, logger
+from aidial_adapter_openai.utils.request import set_app_config
 
 
 @asynccontextmanager
@@ -84,11 +85,7 @@ def create_app(
     to_init_telemetry: bool = True,
 ) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
-
-    if app_config is None:
-        app_config = ApplicationConfig.from_env()
-
-    app.state.app_config = app_config
+    set_app_config(app, app_config or ApplicationConfig.from_env())
 
     if to_init_telemetry:
         init_telemetry(app, TelemetryConfig())

From cd1b3df240686a50487169d97836695412846c95 Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Wed, 27 Nov 2024 10:39:00 +0200
Subject: [PATCH 04/10] One more refactor - move exception handlers to
 distinct file

---
 aidial_adapter_openai/app.py                | 67 +++-------------
 aidial_adapter_openai/exception_handlers.py | 56 +++++++++++++++
 2 files changed, 63 insertions(+), 60 deletions(-)
 create mode 100644 aidial_adapter_openai/exception_handlers.py

diff --git a/aidial_adapter_openai/app.py b/aidial_adapter_openai/app.py
index 1a264ec..d7f0202 100644
--- a/aidial_adapter_openai/app.py
+++ b/aidial_adapter_openai/app.py
@@ -1,21 +1,18 @@
 from contextlib import asynccontextmanager
 
 import pydantic
-from aidial_sdk._errors import pydantic_validation_exception_handler
 from aidial_sdk.exceptions import HTTPException as DialException
 from aidial_sdk.telemetry.init import init_telemetry
 from aidial_sdk.telemetry.types import TelemetryConfig
-from fastapi import FastAPI, Request
-from fastapi.responses import Response
-from openai import (
-    APIConnectionError,
-    APIError,
-    APIStatusError,
-    APITimeoutError,
-    OpenAIError,
-)
+from fastapi import FastAPI
+from openai import OpenAIError
 
 from aidial_adapter_openai.app_config import ApplicationConfig
+from aidial_adapter_openai.exception_handlers import (
+    dial_exception_handler,
+    openai_exception_handler,
+    pydantic_exception_handler,
+)
 from aidial_adapter_openai.routers.chat_completion import chat_completion
 from aidial_adapter_openai.routers.embeddings import embedding
 from aidial_adapter_openai.utils.http_client import get_http_client
@@ -30,56 +27,6 @@ async def lifespan(app: FastAPI):
     await get_http_client().aclose()
 
 
-def openai_exception_handler(request: Request, e: DialException):
-    if isinstance(e, APIStatusError):
-        r = e.response
-        headers = r.headers
-
-        # Avoid encoding the error message when the original response was encoded.
-        if "Content-Encoding" in headers:
-            del headers["Content-Encoding"]
-
-        return Response(
-            content=r.content,
-            status_code=r.status_code,
-            headers=headers,
-        )
-
-    if isinstance(e, APITimeoutError):
-        raise DialException(
-            status_code=504,
-            type="timeout",
-            message="Request timed out",
-            display_message="Request timed out. Please try again later.",
-        )
-
-    if isinstance(e, APIConnectionError):
-        raise DialException(
-            status_code=502,
-            type="connection",
-            message="Error communicating with OpenAI",
-            display_message="OpenAI server is not responsive. Please try again later.",
-        )
-
-    if isinstance(e, APIError):
-        raise DialException(
-            status_code=getattr(e, "status_code", None) or 500,
-            message=e.message,
-            type=e.type,
-            code=e.code,
-            param=e.param,
-            display_message=None,
-        )
-
-
-def pydantic_exception_handler(request: Request, exc: pydantic.ValidationError):
-    return pydantic_validation_exception_handler(request, exc)
-
-
-def dial_exception_handler(request: Request, exc: DialException):
-    return exc.to_fastapi_response()
-
-
 def create_app(
diff --git a/aidial_adapter_openai/exception_handlers.py b/aidial_adapter_openai/exception_handlers.py
new file mode 100644
index 0000000..c98c122
--- /dev/null
+++ b/aidial_adapter_openai/exception_handlers.py
@@ -0,0 +1,56 @@
+import pydantic
+from aidial_sdk._errors import pydantic_validation_exception_handler
+from aidial_sdk.exceptions import HTTPException as DialException
+from fastapi import Request
+from fastapi.responses import Response
+from openai import APIConnectionError, APIError, APIStatusError, APITimeoutError
+
+
+def openai_exception_handler(request: Request, e: DialException):
+    if isinstance(e, APIStatusError):
+        r = e.response
+        headers = r.headers
+
+        # Avoid encoding the error message when the original response was encoded.
+        if "Content-Encoding" in headers:
+            del headers["Content-Encoding"]
+
+        return Response(
+            content=r.content,
+            status_code=r.status_code,
+            headers=headers,
+        )
+
+    if isinstance(e, APITimeoutError):
+        raise DialException(
+            status_code=504,
+            type="timeout",
+            message="Request timed out",
+            display_message="Request timed out. Please try again later.",
+        )
+
+    if isinstance(e, APIConnectionError):
+        raise DialException(
+            status_code=502,
+            type="connection",
+            message="Error communicating with OpenAI",
+            display_message="OpenAI server is not responsive. Please try again later.",
+        )
+
+    if isinstance(e, APIError):
+        raise DialException(
+            status_code=getattr(e, "status_code", None) or 500,
+            message=e.message,
+            type=e.type,
+            code=e.code,
+            param=e.param,
+            display_message=None,
+        )
+
+
+def pydantic_exception_handler(request: Request, exc: pydantic.ValidationError):
+    return pydantic_validation_exception_handler(request, exc)
+
+
+def dial_exception_handler(request: Request, exc: DialException):
+    return exc.to_fastapi_response()

From e16cef9040b0fab37066ee7a13eb7e1daa2ec05f Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Wed, 27 Nov 2024 10:42:33 +0200
Subject: [PATCH 05/10] Make app.py even cleaner

---
 aidial_adapter_openai/app.py              | 14 ++++++--------
 aidial_adapter_openai/routers/__init__.py |  3 +++
 aidial_adapter_openai/routers/health.py   |  2 ++
 3 files changed, 11 insertions(+), 8 deletions(-)
 create mode 100644 aidial_adapter_openai/routers/health.py

diff --git a/aidial_adapter_openai/app.py b/aidial_adapter_openai/app.py
index d7f0202..2b4334f 100644
--- a/aidial_adapter_openai/app.py
+++ b/aidial_adapter_openai/app.py
@@ -7,14 +7,13 @@
 from fastapi import FastAPI
 from openai import OpenAIError
 
+import aidial_adapter_openai.routers as routers
 from aidial_adapter_openai.app_config import ApplicationConfig
 from aidial_adapter_openai.exception_handlers import (
     dial_exception_handler,
     openai_exception_handler,
     pydantic_exception_handler,
 )
-from aidial_adapter_openai.routers.chat_completion import chat_completion
-from aidial_adapter_openai.routers.embeddings import embedding
 from aidial_adapter_openai.utils.http_client import get_http_client
 from aidial_adapter_openai.utils.log_config import configure_loggers, logger
 from aidial_adapter_openai.utils.request import set_app_config
@@ -39,13 +38,12 @@ def create_app(
 
     configure_loggers()
 
-    @app.get("/health")
-    def health():
-        return {"status": "ok"}
-
-    app.post("/openai/deployments/{deployment_id:path}/embeddings")(embedding)
+    app.get("/health")(routers.health)
+    app.post("/openai/deployments/{deployment_id:path}/embeddings")(
+        routers.embedding
+    )
     app.post("/openai/deployments/{deployment_id:path}/chat/completions")(
-        chat_completion
+        routers.chat_completion
     )
     app.exception_handler(OpenAIError)(openai_exception_handler)
     app.exception_handler(pydantic.ValidationError)(pydantic_exception_handler)
diff --git a/aidial_adapter_openai/routers/__init__.py b/aidial_adapter_openai/routers/__init__.py
index e69de29..420e7fa 100644
--- a/aidial_adapter_openai/routers/__init__.py
+++ b/aidial_adapter_openai/routers/__init__.py
@@ -0,0 +1,3 @@
+from .chat_completion import chat_completion
+from .embeddings import embedding
+from .health import health
diff --git a/aidial_adapter_openai/routers/health.py b/aidial_adapter_openai/routers/health.py
new file mode 100644
index 0000000..834f2a3
--- /dev/null
+++ b/aidial_adapter_openai/routers/health.py
@@ -0,0 +1,2 @@
+def health():
+    return {"status": "ok"}

From 0b08f3665daab009295298fbcb66a467fceb74d9 Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Wed, 27 Nov 2024 10:47:32 +0200
Subject: [PATCH 06/10] Turn off telemetry for test app instance

---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index a8efc95..876cdce 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,7 +9,7 @@
 
 @pytest_asyncio.fixture
 def _app_instance():
-    return create_app()
+    return create_app(to_init_telemetry=False)
 
 
 @pytest_asyncio.fixture

From 66faf7265ba616ddad3afde66dbd0d82cfdfd8a9 Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Thu, 28 Nov 2024 17:47:57 +0200
Subject: [PATCH 07/10] More refactoring due to PR comments

---
 aidial_adapter_openai/app.py             |  16 +-
 aidial_adapter_openai/app_config.py      | 119 ++++++++-----
 aidial_adapter_openai/constant.py        |  11 ++
 .../{routers => endpoints}/__init__.py   |   0
 .../endpoints/chat_completion.py         | 160 ++++++++++++++++++
 .../{routers => endpoints}/embeddings.py |  12 +-
 .../{routers => endpoints}/health.py     |   0
 .../routers/chat_completion.py           | 159 -----------------
 .../utils/image_tokenizer.py             |  31 ++--
 aidial_adapter_openai/utils/json.py      |   2 +
 tests/conftest.py                        |   4 +-
 11 files changed, 276 insertions(+), 238 deletions(-)
 create mode 100644 aidial_adapter_openai/constant.py
 rename aidial_adapter_openai/{routers => endpoints}/__init__.py (100%)
 create mode 100644 aidial_adapter_openai/endpoints/chat_completion.py
 rename aidial_adapter_openai/{routers => endpoints}/embeddings.py (79%)
 rename aidial_adapter_openai/{routers => endpoints}/health.py (100%)
 delete mode 100644 aidial_adapter_openai/routers/chat_completion.py
 create mode 100644 aidial_adapter_openai/utils/json.py

diff --git a/aidial_adapter_openai/app.py b/aidial_adapter_openai/app.py
index 2b4334f..5d07629 100644
--- a/aidial_adapter_openai/app.py
+++ b/aidial_adapter_openai/app.py
@@ -2,12 +2,12 @@
 
 import pydantic
 from aidial_sdk.exceptions import HTTPException as DialException
-from aidial_sdk.telemetry.init import init_telemetry
+from aidial_sdk.telemetry.init import init_telemetry as sdk_init_telemetry
 from aidial_sdk.telemetry.types import TelemetryConfig
 from fastapi import FastAPI
 from openai import OpenAIError
 
-import aidial_adapter_openai.routers as routers
+import aidial_adapter_openai.endpoints as endpoints
 from aidial_adapter_openai.app_config import ApplicationConfig
 from aidial_adapter_openai.exception_handlers import (
     dial_exception_handler,
     openai_exception_handler,
     pydantic_exception_handler,
 )
 from aidial_adapter_openai.utils.http_client import get_http_client
 from aidial_adapter_openai.utils.log_config import configure_loggers, logger
 from aidial_adapter_openai.utils.request import set_app_config
@@ -28,22 +28,22 @@ async def lifespan(app: FastAPI):
 
 
 def create_app(
     app_config: ApplicationConfig | None = None,
-    to_init_telemetry: bool = True,
+    init_telemetry: bool = True,
 ) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
     set_app_config(app, app_config or ApplicationConfig.from_env())
 
-    if to_init_telemetry:
-        init_telemetry(app, TelemetryConfig())
+    if init_telemetry:
+        sdk_init_telemetry(app, TelemetryConfig())
 
     configure_loggers()
 
-    app.get("/health")(routers.health)
+    app.get("/health")(endpoints.health)
     app.post("/openai/deployments/{deployment_id:path}/embeddings")(
-        routers.embedding
+        endpoints.embedding
     )
     app.post("/openai/deployments/{deployment_id:path}/chat/completions")(
-        routers.chat_completion
+        endpoints.chat_completion
     )
     app.exception_handler(OpenAIError)(openai_exception_handler)
     app.exception_handler(pydantic.ValidationError)(pydantic_exception_handler)
diff --git a/aidial_adapter_openai/app_config.py b/aidial_adapter_openai/app_config.py
index d024e4c..a59161a 100644
--- a/aidial_adapter_openai/app_config.py
+++ b/aidial_adapter_openai/app_config.py
@@ -4,24 +4,13 @@
 
 from pydantic import BaseModel
 
+from aidial_adapter_openai.constant import ChatCompletionDeploymentType
 from aidial_adapter_openai.utils.env import get_env_bool
+from aidial_adapter_openai.utils.json import remove_nones
 from aidial_adapter_openai.utils.log_config import logger
 from aidial_adapter_openai.utils.parsers import parse_deployment_list
 
 
-def _get_eliminate_empty_choices() -> bool:
-    old_name = "FIX_STREAMING_ISSUES_IN_NEW_API_VERSIONS"
-    new_name = "ELIMINATE_EMPTY_CHOICES"
-
-    if old_name in os.environ:
-        logger.warning(
-            f"{old_name} environment variable is deprecated. Use {new_name} instead."
-        )
-        return get_env_bool(old_name, False)
-
-    return get_env_bool(new_name, False)
-
-
 class ApplicationConfig(BaseModel):
     MODEL_ALIASES: Dict[str, str] = {}
     DALLE3_DEPLOYMENTS: List[str] = []
@@ -37,42 +26,78 @@ class ApplicationConfig(BaseModel):
     NON_STREAMING_DEPLOYMENTS: List[str] = []
     ELIMINATE_EMPTY_CHOICES: bool = False
 
+    def get_chat_completion_deployment_type(
+        self, deployment_id: str
+    ) -> ChatCompletionDeploymentType:
+        if deployment_id in self.DALLE3_DEPLOYMENTS:
+            return ChatCompletionDeploymentType.DALLE3
+        elif deployment_id in self.GPT4_VISION_DEPLOYMENTS:
+            return ChatCompletionDeploymentType.GPT4_VISION
+        elif deployment_id in self.MISTRAL_DEPLOYMENTS:
+            return ChatCompletionDeploymentType.MISTRAL
+        elif deployment_id in self.DATABRICKS_DEPLOYMENTS:
+            return ChatCompletionDeploymentType.DATABRICKS
+        elif deployment_id in self.GPT4O_DEPLOYMENTS:
+            return ChatCompletionDeploymentType.GPT4O
+        elif deployment_id in self.GPT4O_MINI_DEPLOYMENTS:
+            return ChatCompletionDeploymentType.GPT4O_MINI
+        else:
+            return ChatCompletionDeploymentType.GPT_TEXT_ONLY
+
     @classmethod
     def from_env(cls) -> "ApplicationConfig":
+        def _parse_env_deployments(deployments_key: str) -> List[str] | None:
+            return parse_deployment_list(os.getenv(deployments_key)) or None
+
+        def _parse_env_dict(key: str) -> Dict[str, str] | None:
+            value = os.getenv(key)
+            return json.loads(value) if value else None
+
+        def _parse_eliminate_empty_choices() -> bool | None:
+            old_name = "FIX_STREAMING_ISSUES_IN_NEW_API_VERSIONS"
+            new_name = "ELIMINATE_EMPTY_CHOICES"
+
+            if old_name in os.environ:
+                logger.warning(
+                    f"{old_name} environment variable is deprecated. Use {new_name} instead."
+ ) + return get_env_bool(old_name) + elif new_name in os.environ: + return get_env_bool(new_name) + + return None + + deployment_fields = { + deployment_key: _parse_env_deployments(deployment_key) + for deployment_key in ( + "DALLE3_DEPLOYMENTS", + "GPT4_VISION_DEPLOYMENTS", + "MISTRAL_DEPLOYMENTS", + "DATABRICKS_DEPLOYMENTS", + "GPT4O_DEPLOYMENTS", + "GPT4O_MINI_DEPLOYMENTS", + "AZURE_AI_VISION_DEPLOYMENTS", + "NON_STREAMING_DEPLOYMENTS", + ) + } + dict_fields = { + key: _parse_env_dict(key) + for key in ( + "MODEL_ALIASES", + "API_VERSIONS_MAPPING", + "COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES", + ) + } + return cls( - MODEL_ALIASES=json.loads(os.getenv("MODEL_ALIASES", "{}")), - DALLE3_DEPLOYMENTS=parse_deployment_list( - os.getenv("DALLE3_DEPLOYMENTS") - ), - GPT4_VISION_DEPLOYMENTS=parse_deployment_list( - os.getenv("GPT4_VISION_DEPLOYMENTS") - ), - MISTRAL_DEPLOYMENTS=parse_deployment_list( - os.getenv("MISTRAL_DEPLOYMENTS") - ), - DATABRICKS_DEPLOYMENTS=parse_deployment_list( - os.getenv("DATABRICKS_DEPLOYMENTS") - ), - GPT4O_DEPLOYMENTS=parse_deployment_list( - os.getenv("GPT4O_DEPLOYMENTS") - ), - GPT4O_MINI_DEPLOYMENTS=parse_deployment_list( - os.getenv("GPT4O_MINI_DEPLOYMENTS") - ), - AZURE_AI_VISION_DEPLOYMENTS=parse_deployment_list( - os.getenv("AZURE_AI_VISION_DEPLOYMENTS") - ), - API_VERSIONS_MAPPING=json.loads( - os.getenv("API_VERSIONS_MAPPING", "{}") - ), - COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES=json.loads( - os.getenv("COMPLETION_DEPLOYMENTS_PROMPT_TEMPLATES") or "{}" - ), - DALLE3_AZURE_API_VERSION=os.getenv( - "DALLE3_AZURE_API_VERSION", "2024-02-01" - ), - NON_STREAMING_DEPLOYMENTS=parse_deployment_list( - os.getenv("NON_STREAMING_DEPLOYMENTS") + **remove_nones( + { + **deployment_fields, + **dict_fields, + "DALLE3_AZURE_API_VERSION": os.getenv( + "DALLE3_AZURE_API_VERSION" + ), + "ELIMINATE_EMPTY_CHOICES": _parse_eliminate_empty_choices(), + } ), - ELIMINATE_EMPTY_CHOICES=_get_eliminate_empty_choices(), ) diff --git a/aidial_adapter_openai/constant.py b/aidial_adapter_openai/constant.py new file mode 100644 index 0000000..c24090f --- /dev/null +++ b/aidial_adapter_openai/constant.py @@ -0,0 +1,11 @@ +from enum import StrEnum, auto + + +class ChatCompletionDeploymentType(StrEnum): + DALLE3 = auto() + MISTRAL = auto() + DATABRICKS = auto() + GPT4_VISION = auto() + GPT4O = auto() + GPT4O_MINI = auto() + GPT_TEXT_ONLY = auto() diff --git a/aidial_adapter_openai/routers/__init__.py b/aidial_adapter_openai/endpoints/__init__.py similarity index 100% rename from aidial_adapter_openai/routers/__init__.py rename to aidial_adapter_openai/endpoints/__init__.py diff --git a/aidial_adapter_openai/endpoints/chat_completion.py b/aidial_adapter_openai/endpoints/chat_completion.py new file mode 100644 index 0000000..e7f03a6 --- /dev/null +++ b/aidial_adapter_openai/endpoints/chat_completion.py @@ -0,0 +1,160 @@ +from typing import assert_never + +from fastapi import Request + +from aidial_adapter_openai.app_config import ApplicationConfig +from aidial_adapter_openai.completions import chat_completion as completion +from aidial_adapter_openai.constant import ChatCompletionDeploymentType +from aidial_adapter_openai.dalle3 import ( + chat_completion as dalle3_chat_completion, +) +from aidial_adapter_openai.databricks import ( + chat_completion as databricks_chat_completion, +) +from aidial_adapter_openai.dial_api.storage import create_file_storage +from aidial_adapter_openai.gpt import gpt_chat_completion +from aidial_adapter_openai.gpt4_multi_modal.chat_completion import ( + 
gpt4_vision_chat_completion, + gpt4o_chat_completion, +) +from aidial_adapter_openai.mistral import ( + chat_completion as mistral_chat_completion, +) +from aidial_adapter_openai.utils.auth import get_credentials +from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer +from aidial_adapter_openai.utils.parsers import completions_parser, parse_body +from aidial_adapter_openai.utils.request import ( + get_api_version, + get_request_app_config, +) +from aidial_adapter_openai.utils.streaming import create_server_response +from aidial_adapter_openai.utils.tokenizer import ( + MultiModalTokenizer, + PlainTextTokenizer, +) + + +async def call_chat_completion( + deployment_type: ChatCompletionDeploymentType, + deployment_id: str, + data: dict, + is_stream: bool, + request: Request, + app_config: ApplicationConfig, +): + + # Azure OpenAI deployments ignore "model" request field, + # since the deployment id is already encoded in the endpoint path. + # This is not the case for non-Azure OpenAI deployments, so + # they require the "model" field to be set. + # However, openai==1.33.0 requires the "model" field for **both** + # Azure and non-Azure deployments. + # Therefore, we provide the "model" field for all deployments here. + # The same goes for /embeddings endpoint. + data["model"] = deployment_id + + creds = await get_credentials(request) + api_version = get_api_version(request) + + upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"] + + if completions_endpoint := completions_parser.parse(upstream_endpoint): + return await completion( + data, + completions_endpoint, + creds, + api_version, + deployment_id, + app_config, + ) + match deployment_type: + case ChatCompletionDeploymentType.DALLE3: + storage = create_file_storage("images", request.headers) + return await dalle3_chat_completion( + data, + upstream_endpoint, + creds, + is_stream, + storage, + app_config.DALLE3_AZURE_API_VERSION, + ) + case ChatCompletionDeploymentType.MISTRAL: + return await mistral_chat_completion(data, upstream_endpoint, creds) + case ChatCompletionDeploymentType.DATABRICKS: + return await databricks_chat_completion( + data, upstream_endpoint, creds + ) + case ChatCompletionDeploymentType.GPT4_VISION: + tokenizer = MultiModalTokenizer( + "gpt-4", get_image_tokenizer(deployment_type) + ) + return await gpt4_vision_chat_completion( + data, + deployment_id, + upstream_endpoint, + creds, + is_stream, + create_file_storage("images", request.headers), + api_version, + tokenizer, + app_config.ELIMINATE_EMPTY_CHOICES, + ) + case ( + ChatCompletionDeploymentType.GPT4O + | ChatCompletionDeploymentType.GPT4O_MINI + ): + + tokenizer = MultiModalTokenizer( + app_config.MODEL_ALIASES.get(deployment_id, deployment_id), + get_image_tokenizer(deployment_type), + ) + return await gpt4o_chat_completion( + data, + deployment_id, + upstream_endpoint, + creds, + is_stream, + create_file_storage("images", request.headers), + api_version, + tokenizer, + app_config.ELIMINATE_EMPTY_CHOICES, + ) + case ChatCompletionDeploymentType.GPT_TEXT_ONLY: + tokenizer = PlainTextTokenizer( + model=app_config.MODEL_ALIASES.get(deployment_id, deployment_id) + ) + return await gpt_chat_completion( + data, + deployment_id, + upstream_endpoint, + creds, + api_version, + tokenizer, + app_config.ELIMINATE_EMPTY_CHOICES, + ) + case _: + assert_never(deployment_type) + + +async def chat_completion(deployment_id: str, request: Request): + app_config = get_request_app_config(request) + data = await parse_body(request) + + deployment_type = 
app_config.get_chat_completion_deployment_type( + deployment_id + ) + is_stream = bool(data.get("stream")) + + emulate_streaming = ( + deployment_id in app_config.NON_STREAMING_DEPLOYMENTS and is_stream + ) + + if emulate_streaming: + data["stream"] = False + + return create_server_response( + emulate_streaming, + await call_chat_completion( + deployment_type, deployment_id, data, is_stream, request, app_config + ), + ) diff --git a/aidial_adapter_openai/routers/embeddings.py b/aidial_adapter_openai/endpoints/embeddings.py similarity index 79% rename from aidial_adapter_openai/routers/embeddings.py rename to aidial_adapter_openai/endpoints/embeddings.py index 126d3ff..9960036 100644 --- a/aidial_adapter_openai/routers/embeddings.py +++ b/aidial_adapter_openai/endpoints/embeddings.py @@ -1,8 +1,5 @@ -from typing import Annotated +from fastapi import Request -from fastapi import Depends, Request - -from aidial_adapter_openai.app_config import ApplicationConfig from aidial_adapter_openai.dial_api.storage import create_file_storage from aidial_adapter_openai.embeddings.azure_ai_vision import ( embeddings as azure_ai_vision_embeddings, @@ -18,11 +15,8 @@ ) -async def embedding( - deployment_id: str, - request: Request, - app_config: Annotated[ApplicationConfig, Depends(get_request_app_config)], -): +async def embedding(deployment_id: str, request: Request): + app_config = get_request_app_config(request) data = await parse_body(request) # See note for /chat/completions endpoint diff --git a/aidial_adapter_openai/routers/health.py b/aidial_adapter_openai/endpoints/health.py similarity index 100% rename from aidial_adapter_openai/routers/health.py rename to aidial_adapter_openai/endpoints/health.py diff --git a/aidial_adapter_openai/routers/chat_completion.py b/aidial_adapter_openai/routers/chat_completion.py deleted file mode 100644 index d6f86ac..0000000 --- a/aidial_adapter_openai/routers/chat_completion.py +++ /dev/null @@ -1,159 +0,0 @@ -from typing import Annotated - -from fastapi import Depends, Request - -from aidial_adapter_openai.app_config import ApplicationConfig -from aidial_adapter_openai.completions import chat_completion as completion -from aidial_adapter_openai.dalle3 import ( - chat_completion as dalle3_chat_completion, -) -from aidial_adapter_openai.databricks import ( - chat_completion as databricks_chat_completion, -) -from aidial_adapter_openai.dial_api.storage import create_file_storage -from aidial_adapter_openai.gpt import gpt_chat_completion -from aidial_adapter_openai.gpt4_multi_modal.chat_completion import ( - gpt4_vision_chat_completion, - gpt4o_chat_completion, -) -from aidial_adapter_openai.mistral import ( - chat_completion as mistral_chat_completion, -) -from aidial_adapter_openai.utils.auth import get_credentials -from aidial_adapter_openai.utils.image_tokenizer import get_image_tokenizer -from aidial_adapter_openai.utils.parsers import completions_parser, parse_body -from aidial_adapter_openai.utils.request import ( - get_api_version, - get_request_app_config, -) -from aidial_adapter_openai.utils.streaming import create_server_response -from aidial_adapter_openai.utils.tokenizer import ( - MultiModalTokenizer, - PlainTextTokenizer, -) - - -async def call_chat_completion( - deployment_id: str, - data: dict, - is_stream: bool, - request: Request, - app_config: ApplicationConfig, -): - - # Azure OpenAI deployments ignore "model" request field, - # since the deployment id is already encoded in the endpoint path. 
- # This is not the case for non-Azure OpenAI deployments, so - # they require the "model" field to be set. - # However, openai==1.33.0 requires the "model" field for **both** - # Azure and non-Azure deployments. - # Therefore, we provide the "model" field for all deployments here. - # The same goes for /embeddings endpoint. - data["model"] = deployment_id - - creds = await get_credentials(request) - api_version = get_api_version(request) - - upstream_endpoint = request.headers["X-UPSTREAM-ENDPOINT"] - - if completions_endpoint := completions_parser.parse(upstream_endpoint): - return await completion( - data, - completions_endpoint, - creds, - api_version, - deployment_id, - app_config, - ) - if deployment_id in app_config.DALLE3_DEPLOYMENTS: - storage = create_file_storage("images", request.headers) - return await dalle3_chat_completion( - data, - upstream_endpoint, - creds, - is_stream, - storage, - app_config.DALLE3_AZURE_API_VERSION, - ) - - if deployment_id in app_config.MISTRAL_DEPLOYMENTS: - return await mistral_chat_completion(data, upstream_endpoint, creds) - - if deployment_id in app_config.DATABRICKS_DEPLOYMENTS: - return await databricks_chat_completion(data, upstream_endpoint, creds) - - text_tokenizer_model = app_config.MODEL_ALIASES.get( - deployment_id, deployment_id - ) - - if deployment_id in app_config.GPT4_VISION_DEPLOYMENTS: - tokenizer = MultiModalTokenizer( - "gpt-4", get_image_tokenizer(deployment_id, app_config) - ) - return await gpt4_vision_chat_completion( - data, - deployment_id, - upstream_endpoint, - creds, - is_stream, - create_file_storage("images", request.headers), - api_version, - tokenizer, - app_config.ELIMINATE_EMPTY_CHOICES, - ) - - if deployment_id in ( - *app_config.GPT4O_DEPLOYMENTS, - *app_config.GPT4O_MINI_DEPLOYMENTS, - ): - tokenizer = MultiModalTokenizer( - text_tokenizer_model, - get_image_tokenizer(deployment_id, app_config), - ) - return await gpt4o_chat_completion( - data, - deployment_id, - upstream_endpoint, - creds, - is_stream, - create_file_storage("images", request.headers), - api_version, - tokenizer, - app_config.ELIMINATE_EMPTY_CHOICES, - ) - - tokenizer = PlainTextTokenizer(model=text_tokenizer_model) - return await gpt_chat_completion( - data, - deployment_id, - upstream_endpoint, - creds, - api_version, - tokenizer, - app_config.ELIMINATE_EMPTY_CHOICES, - ) - - -async def chat_completion( - deployment_id: str, - request: Request, - app_config: Annotated[ApplicationConfig, Depends(get_request_app_config)], -): - - data = await parse_body(request) - - is_stream = bool(data.get("stream")) - - emulate_streaming = ( - deployment_id in app_config.NON_STREAMING_DEPLOYMENTS and is_stream - ) - - if emulate_streaming: - data["stream"] = False - - return create_server_response( - emulate_streaming, - await call_chat_completion( - deployment_id, data, is_stream, request, app_config - ), - ) diff --git a/aidial_adapter_openai/utils/image_tokenizer.py b/aidial_adapter_openai/utils/image_tokenizer.py index 01eb467..7f9eb79 100644 --- a/aidial_adapter_openai/utils/image_tokenizer.py +++ b/aidial_adapter_openai/utils/image_tokenizer.py @@ -4,11 +4,11 @@ """ import math -from typing import assert_never +from typing import Literal, assert_never from pydantic import BaseModel -from aidial_adapter_openai.app_config import ApplicationConfig +from aidial_adapter_openai.constant import ChatCompletionDeploymentType from aidial_adapter_openai.utils.image import ImageDetail, resolve_detail_level @@ -54,20 +54,25 @@ def _compute_high_detail_tokens(self, 
width: int, height: int) -> int: low_detail_tokens=2833, tokens_per_tile=5667 ) +MultiModalDeployments = Literal[ + ChatCompletionDeploymentType.GPT4O, + ChatCompletionDeploymentType.GPT4O_MINI, + ChatCompletionDeploymentType.GPT4_VISION, +] + def get_image_tokenizer( - deployment_id: str, app_config: ApplicationConfig + deployment_type: MultiModalDeployments, ) -> ImageTokenizer: - if deployment_id in app_config.GPT4O_DEPLOYMENTS: - return GPT4O_IMAGE_TOKENIZER - elif deployment_id in app_config.GPT4O_MINI_DEPLOYMENTS: - return GPT4O_MINI_IMAGE_TOKENIZER - elif deployment_id in app_config.GPT4_VISION_DEPLOYMENTS: - return GPT4_VISION_IMAGE_TOKENIZER - else: - raise RuntimeError( - f"No image tokenizer found for deployment {deployment_id}" - ) + match deployment_type: + case ChatCompletionDeploymentType.GPT4O: + return GPT4O_IMAGE_TOKENIZER + case ChatCompletionDeploymentType.GPT4O_MINI: + return GPT4O_MINI_IMAGE_TOKENIZER + case ChatCompletionDeploymentType.GPT4_VISION: + return GPT4_VISION_IMAGE_TOKENIZER + case _: + assert_never(deployment_type) def _fit_longest(width: int, height: int, size: int) -> tuple[int, int]: diff --git a/aidial_adapter_openai/utils/json.py b/aidial_adapter_openai/utils/json.py new file mode 100644 index 0000000..2a7b8a6 --- /dev/null +++ b/aidial_adapter_openai/utils/json.py @@ -0,0 +1,2 @@ +def remove_nones(d: dict) -> dict: + return {k: v for k, v in d.items() if v is not None} diff --git a/tests/conftest.py b/tests/conftest.py index 876cdce..0a51517 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,9 +7,9 @@ from aidial_adapter_openai.utils.request import get_app_config -@pytest_asyncio.fixture +@pytest.fixture def _app_instance(): - return create_app(to_init_telemetry=False) + return create_app(init_telemetry=False) @pytest_asyncio.fixture From 44320f29ce1c1d986e9273724209726c4e917d9f Mon Sep 17 00:00:00 2001 From: Roman Romanov Date: Thu, 28 Nov 2024 20:46:41 +0200 Subject: [PATCH 08/10] Fix: handle completions endpoint --- aidial_adapter_openai/endpoints/chat_completion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aidial_adapter_openai/endpoints/chat_completion.py b/aidial_adapter_openai/endpoints/chat_completion.py index e7f03a6..4682c42 100644 --- a/aidial_adapter_openai/endpoints/chat_completion.py +++ b/aidial_adapter_openai/endpoints/chat_completion.py @@ -35,7 +35,6 @@ async def call_chat_completion( - deployment_type: ChatCompletionDeploymentType, deployment_id: str, data: dict, is_stream: bool, @@ -67,6 +66,10 @@ async def call_chat_completion( deployment_id, app_config, ) + + deployment_type = app_config.get_chat_completion_deployment_type( + deployment_id + ) match deployment_type: case ChatCompletionDeploymentType.DALLE3: storage = create_file_storage("images", request.headers) @@ -140,9 +143,6 @@ async def chat_completion(deployment_id: str, request: Request): app_config = get_request_app_config(request) data = await parse_body(request) - deployment_type = app_config.get_chat_completion_deployment_type( - deployment_id - ) is_stream = bool(data.get("stream")) emulate_streaming = ( @@ -155,6 +155,6 @@ async def chat_completion(deployment_id: str, request: Request): return create_server_response( emulate_streaming, await call_chat_completion( - deployment_type, deployment_id, data, is_stream, request, app_config + deployment_id, data, is_stream, request, app_config ), ) From cbeda2b9a3e90fba96a1ee79fbdc7bf46eb714d5 Mon Sep 17 00:00:00 2001 From: Roman Romanov Date: Fri, 29 Nov 2024 17:14:21 +0200 
Subject: [PATCH 09/10] Inline deployment list parsing into app config
---
 aidial_adapter_openai/app_config.py    | 6 ++++--
 aidial_adapter_openai/utils/parsers.py | 4 ----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/aidial_adapter_openai/app_config.py b/aidial_adapter_openai/app_config.py
index a59161a..3180172 100644
--- a/aidial_adapter_openai/app_config.py
+++ b/aidial_adapter_openai/app_config.py
@@ -8,7 +8,6 @@
 from aidial_adapter_openai.utils.env import get_env_bool
 from aidial_adapter_openai.utils.json import remove_nones
 from aidial_adapter_openai.utils.log_config import logger
-from aidial_adapter_openai.utils.parsers import parse_deployment_list
 
 
 class ApplicationConfig(BaseModel):
@@ -47,7 +46,10 @@ def get_chat_completion_deployment_type(
     @classmethod
     def from_env(cls) -> "ApplicationConfig":
         def _parse_env_deployments(deployments_key: str) -> List[str] | None:
-            return parse_deployment_list(os.getenv(deployments_key)) or None
+            deployments_value = os.getenv(deployments_key)
+            if deployments_value is None:
+                return None
+            return list(map(str.strip, (deployments_value).split(",")))
 
         def _parse_env_dict(key: str) -> Dict[str, str] | None:
             value = os.getenv(key)
diff --git a/aidial_adapter_openai/utils/parsers.py b/aidial_adapter_openai/utils/parsers.py
index 8975093..15b7908 100644
--- a/aidial_adapter_openai/utils/parsers.py
+++ b/aidial_adapter_openai/utils/parsers.py
@@ -110,7 +110,3 @@ async def parse_body(request: Request) -> Dict[str, Any]:
         raise InvalidRequestError(str(data) + " is not of type 'object'")
 
     return data
-
-
-def parse_deployment_list(deployments: str | None) -> List[str]:
-    return list(map(str.strip, (deployments or "").split(",")))

From 5686a8c249fb8bcaff2dd0e971c68df88f58e1e0 Mon Sep 17 00:00:00 2001
From: Roman Romanov
Date: Fri, 29 Nov 2024 17:23:15 +0200
Subject: [PATCH 10/10] Fix linter
---
 aidial_adapter_openai/utils/parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aidial_adapter_openai/utils/parsers.py b/aidial_adapter_openai/utils/parsers.py
index 15b7908..3591ff9 100644
--- a/aidial_adapter_openai/utils/parsers.py
+++ b/aidial_adapter_openai/utils/parsers.py
@@ -1,7 +1,7 @@
 import re
 from abc import ABC, abstractmethod
 from json import JSONDecodeError
-from typing import Any, Dict, List, TypedDict
+from typing import Any, Dict, TypedDict
 
 from aidial_sdk.exceptions import InvalidRequestError
 from fastapi import Request
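
A note on the configuration pattern that PATCH 07 introduces and PATCH 09 refines: every _parse_env_* helper returns None when its variable is unset, and remove_nones drops those entries before the pydantic model is constructed, so the field defaults (empty lists and dicts, False, the fallback DALL-E API version) apply only when nothing was configured. Below is a minimal, self-contained sketch of the idiom; the simplified AdapterConfig model and its single field are illustrative stand-ins, not the adapter's real ApplicationConfig.

import os
from typing import List

from pydantic import BaseModel


def remove_nones(d: dict) -> dict:
    # Same helper as aidial_adapter_openai/utils/json.py.
    return {k: v for k, v in d.items() if v is not None}


class AdapterConfig(BaseModel):
    MISTRAL_DEPLOYMENTS: List[str] = []

    @classmethod
    def from_env(cls) -> "AdapterConfig":
        def _parse_deployments(key: str) -> List[str] | None:
            # None (variable unset) is distinct from an explicit value;
            # remove_nones below turns "unset" into "use the field default".
            value = os.getenv(key)
            if value is None:
                return None
            return [item.strip() for item in value.split(",")]

        return cls(
            **remove_nones(
                {"MISTRAL_DEPLOYMENTS": _parse_deployments("MISTRAL_DEPLOYMENTS")}
            )
        )


os.environ.pop("MISTRAL_DEPLOYMENTS", None)
assert AdapterConfig.from_env().MISTRAL_DEPLOYMENTS == []  # default applies

os.environ["MISTRAL_DEPLOYMENTS"] = "mistral-7b, mixtral-8x7b"
assert AdapterConfig.from_env().MISTRAL_DEPLOYMENTS == ["mistral-7b", "mixtral-8x7b"]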
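
The endpoint refactoring replaces chains of "deployment_id in app_config.X_DEPLOYMENTS" checks with a single classification step, get_chat_completion_deployment_type, followed by exhaustive match statements. The payoff is static coverage: with assert_never in the fallback arm, mypy or pyright reports an error whenever a new ChatCompletionDeploymentType member is added but left unhandled. A condensed sketch of the idiom follows (Python 3.11+ for StrEnum and typing.assert_never); the returned strings are placeholders, not the adapter's real tokenizers.

from enum import StrEnum, auto
from typing import assert_never


class DeploymentType(StrEnum):
    GPT4_VISION = auto()
    GPT4O = auto()
    GPT4O_MINI = auto()


def pick_image_tokenizer(deployment_type: DeploymentType) -> str:
    match deployment_type:
        case DeploymentType.GPT4O:
            return "gpt-4o image tokenizer"
        case DeploymentType.GPT4O_MINI:
            return "gpt-4o-mini image tokenizer"
        case DeploymentType.GPT4_VISION:
            return "gpt-4-vision image tokenizer"
        case _:
            # Unreachable at runtime; makes the match provably exhaustive,
            # so adding an enum member without a case fails type checking.
            assert_never(deployment_type)


print(pick_image_tokenizer(DeploymentType.GPT4O))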
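
The init_telemetry flag on create_app (renamed from to_init_telemetry in PATCH 07) exists so tests can build an app instance without initialising exporters. Below is a hedged sketch of how the fixture might be exercised end to end with httpx: the /health route is registered by the patches above, but the ASGITransport wiring and the test itself are assumptions about the test setup, not the repository's actual conftest.

import httpx
import pytest
import pytest_asyncio

from aidial_adapter_openai.app import create_app


@pytest.fixture
def _app_instance():
    # Telemetry stays off so the test run does not attempt to
    # configure exporters or emit spans.
    return create_app(init_telemetry=False)


@pytest_asyncio.fixture
async def test_http_client(_app_instance):
    async with httpx.AsyncClient(
        transport=httpx.ASGITransport(app=_app_instance),
        base_url="http://testserver",
    ) as client:
        yield client


@pytest.mark.asyncio
async def test_health(test_http_client):
    response = await test_http_client.get("/health")
    assert response.status_code == 200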
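
Finally, the deprecated-alias handling in from_env: the old FIX_STREAMING_ISSUES_IN_NEW_API_VERSIONS variable keeps working but logs a warning, the new ELIMINATE_EMPTY_CHOICES name is used otherwise, and None signals "not configured" so that remove_nones lets the field default (False) win. A standalone sketch with a simplified get_env_bool stand-in; the adapter's real helper lives in utils/env.py and may parse values differently.

import logging
import os

logger = logging.getLogger(__name__)


def get_env_bool(name: str) -> bool:
    # Simplified stand-in for the adapter's utils.env.get_env_bool.
    return os.environ[name].strip().lower() in ("1", "true", "yes")


def parse_eliminate_empty_choices() -> bool | None:
    old_name = "FIX_STREAMING_ISSUES_IN_NEW_API_VERSIONS"
    new_name = "ELIMINATE_EMPTY_CHOICES"
    if old_name in os.environ:
        # The deprecated spelling still wins, but callers are nudged
        # towards the new variable name.
        logger.warning(
            "%s environment variable is deprecated. Use %s instead.",
            old_name,
            new_name,
        )
        return get_env_bool(old_name)
    if new_name in os.environ:
        return get_env_bool(new_name)
    return None  # not configured: the pydantic default applies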