
Commit 86ef53a

cleanup

1 parent b6bbe4a

File tree

10 files changed (+78 lines, -50 lines)


.env.example

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-DEFAULT_API_VERSION=2023-03-15-preview
+AZURE_API_VERSION=2023-03-15-preview
 MODEL_ALIASES={}
 LOG_LEVEL=INFO

Makefile

Lines changed: 0 additions & 3 deletions
@@ -16,9 +16,6 @@ build: install
 serve: install
 	poetry run uvicorn "aidial_adapter_openai.app:app" --reload --host "0.0.0.0" --port $(PORT) --workers=1 --env-file ./.env
 
-client: install
-	poetry run python -m client.client_adapter $(ARGS)
-
 clean:
 	poetry run clean
 	poetry env remove --all

README.md

Lines changed: 2 additions & 2 deletions
@@ -55,8 +55,8 @@ Copy `.env.example` to `.env` and customize it for your environment:
 |---|---|---|
 |LOG_LEVEL|INFO|Log level. Use DEBUG for dev purposes and INFO in prod|
 |WEB_CONCURRENCY|1|Number of workers for the server|
-|DEFAULT_API_VERSION|2023-03-15-preview|The default version API for requests to Azure Openai API for cases when the user request doesn't contain "api-version"|
-|MODEL_ALIASES|{"gpt-35-turbo":"gpt-3.5-turbo-0301"}|Mapping request's deployment_id to [model name of tiktoken](https://github.com/openai/tiktoken/blob/main/tiktoken/model.py) for correct calculate of tokens.|
+|AZURE_API_VERSION|2023-03-15-preview|The version API for requests to Azure OpenAI API|
+|MODEL_ALIASES|{}|Mapping request's deployment_id to [model name of tiktoken](https://github.com/openai/tiktoken/blob/main/tiktoken/model.py) for correct calculate of tokens. Example: `{"gpt-35-turbo":"gpt-3.5-turbo-0301"}`|
 
 ### Docker
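
For context, a minimal sketch (not part of this commit) of how MODEL_ALIASES could be used to resolve a request's deployment_id to a tiktoken encoding. The helper name `encoding_for_deployment` is illustrative, not the adapter's actual code:

```python
import json
import os

import tiktoken

# Parsed the same way app.py does it: MODEL_ALIASES holds a JSON object.
model_aliases: dict = json.loads(os.getenv("MODEL_ALIASES", "{}"))


def encoding_for_deployment(deployment_id: str) -> tiktoken.Encoding:
    # Fall back to the deployment_id itself when no alias is configured;
    # e.g. {"gpt-35-turbo": "gpt-3.5-turbo-0301"} maps an Azure deployment
    # name to a model name tiktoken recognizes.
    model = model_aliases.get(deployment_id, deployment_id)
    return tiktoken.encoding_for_model(model)
```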

aidial_adapter_openai/app.py

Lines changed: 3 additions & 10 deletions
@@ -20,7 +20,7 @@
 logging.config.dictConfig(LogConfig().dict())
 app = FastAPI()
 model_aliases = json.loads(os.getenv("MODEL_ALIASES", "{}"))
-default_api_version = os.getenv("DEFAULT_API_VERSION", "2023-03-15-preview")
+azure_api_version = os.getenv("AZURE_API_VERSION", "2023-03-15-preview")
 
 
 async def handle_exceptions(call):
@@ -38,7 +38,6 @@ async def handle_exceptions(call):
 
 @app.post("/openai/deployments/{deployment_id}/chat/completions")
 async def chat_completion(deployment_id: str, request: Request):
-    api_version = request.query_params.get("api-version", default_api_version)
     data = await parse_body(request)
 
     is_stream = data.get("stream", False)
@@ -54,7 +53,7 @@ async def chat_completion(deployment_id: str, request: Request):
         api_key=dial_api_key,
         api_base=api_base,
         api_type="azure",
-        api_version=api_version,
+        api_version=azure_api_version,
         request_timeout=(10, 600),  # connect timeout and total timeout
         **data
     )
@@ -76,7 +75,6 @@ async def chat_completion(deployment_id: str, request: Request):
 
 @app.post("/openai/deployments/{deployment_id}/embeddings")
 async def embedding(deployment_id: str, request: Request):
-    api_version = request.query_params.get("api-version", default_api_version)
     data = await parse_body(request)
 
     dial_api_key = request.headers["X-UPSTREAM-KEY"]
@@ -90,7 +88,7 @@ async def embedding(deployment_id: str, request: Request):
         api_key=dial_api_key,
         api_base=api_base,
         api_type="azure",
-        api_version=api_version,
+        api_version=azure_api_version,
         request_timeout=(10, 600),  # connect timeout and total timeout
         **data
     )
@@ -117,10 +115,5 @@ def health():
     return {"status": "ok"}
 
 
-@app.get("/blah")
-def blah():
-    return {"blah": "blah"}
-
-
 if __name__ == "__main__":
     uvicorn.run(app, port=5000)
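
With this change the adapter no longer reads `api-version` from the request's query string; every upstream Azure call uses the fixed AZURE_API_VERSION. A minimal client sketch under stated assumptions: the adapter runs locally on port 5000 (as in the `__main__` block above), `X-UPSTREAM-KEY` carries the Azure API key (shown in the diff), and the payload values are placeholders:

```python
import httpx

# No "api-version" query parameter: the adapter now ignores it anyway.
response = httpx.post(
    "http://localhost:5000/openai/deployments/gpt-4/chat/completions",
    headers={"X-UPSTREAM-KEY": "my-azure-api-key"},  # placeholder key
    json={
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
    timeout=600.0,
)
print(response.json())
```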

aidial_adapter_openai/openai_override.py

Lines changed: 11 additions & 2 deletions
@@ -1,3 +1,12 @@
+"""
+OpenAI SDK translates various HTTP errors received from OpenAI API
+into Python exceptions: error.RateLimitError, error.InvalidRequestError,
+error.AuthenticationError etc.
+
+We want to retranslate the original HTTP errors to the user.
+So the standard error handlers in the openai.api_requestor.APIRequestor class
+are rewritten to wrap the original HTTP errors into OpenAIException and raise it.
+"""
 import json
 from json import JSONDecodeError
 
@@ -16,12 +25,12 @@ def __init__(self, body, code, resp, headers):
         super().__init__(resp)
 
 
-# Overrided to proxy original errors
+# Overridden to proxy original errors
 def handle_error_response_wrapper(wrapped, self, args, kwargs):
     raise OpenAIException(*args)
 
 
-# Overrided to proxy original errors
+# Overridden to proxy original errors
 def interpret_response_line_wrapper(wrapped, self: APIRequestor, args, kwargs):
     rbody, rcode, rheaders = args
     stream = kwargs.get("stream", False)
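
The `(wrapped, self, args, kwargs)` signatures suggest these wrappers are attached with wrapt-style function patching. A hedged sketch of what the registration might look like; the wiring itself is outside this diff, and the patched method names are assumptions based on the openai 0.x SDK:

```python
import wrapt

# Patch the SDK's error handling so the original HTTP error reaches the
# user wrapped in OpenAIException instead of an SDK-specific exception.
# The two wrappers are the functions defined in this module above.
wrapt.wrap_function_wrapper(
    "openai.api_requestor",
    "APIRequestor.handle_error_response",
    handle_error_response_wrapper,
)
wrapt.wrap_function_wrapper(
    "openai.api_requestor",
    "APIRequestor._interpret_response_line",
    interpret_response_line_wrapper,
)
```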

aidial_adapter_openai/utils/tokens.py

Lines changed: 11 additions & 4 deletions
@@ -1,3 +1,6 @@
+"""
+Implemented based on the official recipe: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
+"""
 from typing import Any, List
 
 from tiktoken import Encoding
@@ -7,16 +10,20 @@ def calculate_prompt_tokens(
     messages: List[Any], model: str, encoding: Encoding
 ):
     prompt_tokens = 3
-    tokens_per_message = (
-        4 if model == "gpt-3.5-turbo-0301" else 3
-    )  # possible need change gpt-3.5-turbo to something anything
+
+    if model == "gpt-3.5-turbo-0301":
+        tokens_per_message = 4
+        tokens_per_name = -1
+    else:
+        tokens_per_message = 3
+        tokens_per_name = 1
 
     for message in messages:
         prompt_tokens += tokens_per_message
 
         for key, value in message.items():
             prompt_tokens += len(encoding.encode(value))
             if key == "name":
-                prompt_tokens += 1
+                prompt_tokens += tokens_per_name
 
     return prompt_tokens
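
A minimal usage sketch of the refactored function, following the cited cookbook recipe; the message content is illustrative:

```python
from tiktoken import encoding_for_model

from aidial_adapter_openai.utils.tokens import calculate_prompt_tokens

messages = [{"role": "user", "content": "Hello!"}]
encoding = encoding_for_model("gpt-3.5-turbo-0301")

# 3 tokens prime the assistant's reply, 4 tokens frame each message for
# this model, plus the encoded length of every field value.
print(calculate_prompt_tokens(messages, "gpt-3.5-turbo-0301", encoding))
```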

poetry.lock

Lines changed: 7 additions & 22 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -31,7 +31,6 @@ pydantic = "^1.10.12"
 [tool.poetry.group.test.dependencies]
 pytest = "7.4.0"
 python-dotenv = "1.0.0"
-pytest-dotenv = "^0.5.0"
 pytest-aioresponses = "^0.2.0"
 httpx = "^0.25.0"

tests/test_errors.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ async def test_error_during_streaming(aioresponses: aioresponses):
 
 
 @pytest.mark.asyncio
-async def test_incorrect_upsteram_url(aioresponses: aioresponses):
+async def test_incorrect_upstream_url(aioresponses: aioresponses):
     aioresponses.post(
         "http://localhost:5001/openai/deployments/gpt-4/chat/completions?api-version=2023-03-15-preview",
         status=200,

tests/test_streaming.py

Lines changed: 42 additions & 4 deletions
@@ -10,7 +10,7 @@
 @pytest.mark.asyncio
 async def test_streaming(aioresponses: aioresponses):
     aioresponses.post(
-        "http://localhost:5001/openai/deployments/gpt-4/chat/completions?api-version=2023-06-15",
+        "http://localhost:5001/openai/deployments/gpt-4/chat/completions?api-version=2023-03-15-preview",
         status=200,
         body="data: "
         + json.dumps(
@@ -22,8 +22,8 @@ async def test_streaming(aioresponses: aioresponses):
                 "choices": [
                     {
                         "index": 0,
-                        "finish_reason": "stop",
-                        "message": {
+                        "finish_reason": None,
+                        "delta": {
                             "role": "assistant",
                         },
                     }
@@ -33,6 +33,26 @@ async def test_streaming(aioresponses: aioresponses):
         )
         + "\n\n"
         + "data: "
+        + json.dumps(
+            {
+                "id": "chatcmpl-test",
+                "object": "chat.completion.chunk",
+                "created": 1695940483,
+                "model": "gpt-4",
+                "choices": [
+                    {
+                        "index": 0,
+                        "finish_reason": None,
+                        "delta": {
+                            "content": "Test content",
+                        },
+                    }
+                ],
+                "usage": None,
+            }
+        )
+        + "\n\n"
+        + "data: "
         + json.dumps(
             {
                 "id": "chatcmpl-test",
@@ -68,4 +88,22 @@ async def test_streaming(aioresponses: aioresponses):
             assert line == ""
             continue
 
-        print("!", index, line, "!")  # TODO: change to asserts
+        if index == 0:
+            assert (
+                line
+                == 'data: {"id":"chatcmpl-test","object":"chat.completion.chunk","created":1695940483,"model":"gpt-4","choices":[{"index":0,"finish_reason":null,"delta":{"role":"assistant"}}],"usage":null}'
+            )
+        elif index == 2:
+            assert (
+                line
+                == 'data: {"id":"chatcmpl-test","object":"chat.completion.chunk","created":1695940483,"model":"gpt-4","choices":[{"index":0,"finish_reason":null,"delta":{"content":"Test content"}}],"usage":null}'
+            )
+        elif index == 4:
+            assert (
+                line
+                == 'data: {"id":"chatcmpl-test","object":"chat.completion.chunk","created":1696245654,"model":"gpt-4","choices":[{"index":0,"finish_reason":"stop","delta":{}}],"usage":{"completion_tokens":2,"prompt_tokens":9,"total_tokens":11}}'
+            )
+        elif index == 6:
+            assert line == "data: [DONE]"
+        else:
+            assert False
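
The new assertions rely on SSE framing: each event is a `data: <json>` line followed by a blank line, so after splitting the body on newlines the even indexes hold payloads and the odd indexes are empty separators. A small standalone sketch of that framing, with an assumed two-event body:

```python
body = 'data: {"n": 1}\n\ndata: [DONE]\n\n'

# Drop the final empty element produced by the trailing newline.
for index, line in enumerate(body.split("\n")[:-1]):
    if index % 2 == 1:
        assert line == ""  # blank separator between events
    else:
        assert line.startswith("data: ")  # event payload
```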
