From e35a9cb31ae4242c22a3ffac0e95c37b40be1267 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 3 Jun 2021 17:34:37 +0200 Subject: [PATCH 01/23] Initial commit: scaffolding, already implemented a lot for transformers/ adapters --- square-model-inference-api/.gitignore | 133 +++++++++++++++ square-model-inference-api/Makefile | 42 +++++ square-model-inference-api/README.md | 80 +++++++++ .../auth_server/.env.example | 2 + .../auth_server/Dockerfile | 14 ++ .../auth_server/auth_api/__init__.py | 0 .../auth_server/auth_api/config.py | 7 + .../auth_server/auth_api/messages.py | 4 + .../auth_server/auth_api/security.py | 25 +++ .../auth_server/main.py | 8 + .../auth_server/requirements.txt | 3 + square-model-inference-api/docker-compose.yml | 40 +++++ .../inference_server/.dockerignore | 133 +++++++++++++++ .../inference_server/Dockerfile | 14 ++ .../inference_server/main.py | 21 +++ .../inference_server/requirements.txt | 8 + .../square_model_inference/__init__.py | 0 .../square_model_inference/api/__init__.py | 0 .../api/routes/__init__.py | 0 .../api/routes/heartbeat.py | 11 ++ .../api/routes/prediction.py | 20 +++ .../api/routes/router.py | 7 + .../square_model_inference/core/__init__.py | 0 .../square_model_inference/core/config.py | 18 ++ .../core/event_handlers.py | 40 +++++ .../square_model_inference/core/messages.py | 1 + .../inference/__init__.py | 0 .../inference/adaptertransformer.py | 86 ++++++++++ .../square_model_inference/inference/model.py | 18 ++ .../inference/transformer.py | 155 ++++++++++++++++++ .../square_model_inference/models/__init__.py | 0 .../models/heartbeat.py | 5 + .../models/prediction.py | 48 ++++++ .../square_model_inference/models/request.py | 54 ++++++ .../inference_server/tox.ini | 66 ++++++++ square-model-inference-api/nginx/nginx.conf | 34 ++++ square-model-inference-api/tests/__init__.py | 0 square-model-inference-api/tests/conftest.py | 16 ++ .../tests/test_api/__init__.py | 0 .../tests/test_api/test_api_auth.py | 17 ++ .../tests/test_api/test_heartbeat.py | 14 ++ .../tests/test_api/test_prediction.py | 23 +++ .../tests/test_service/__init__.py | 0 .../tests/test_service/test_models.py | 27 +++ 44 files changed, 1194 insertions(+) create mode 100644 square-model-inference-api/.gitignore create mode 100644 square-model-inference-api/Makefile create mode 100644 square-model-inference-api/README.md create mode 100644 square-model-inference-api/auth_server/.env.example create mode 100644 square-model-inference-api/auth_server/Dockerfile create mode 100644 square-model-inference-api/auth_server/auth_api/__init__.py create mode 100644 square-model-inference-api/auth_server/auth_api/config.py create mode 100644 square-model-inference-api/auth_server/auth_api/messages.py create mode 100644 square-model-inference-api/auth_server/auth_api/security.py create mode 100644 square-model-inference-api/auth_server/main.py create mode 100644 square-model-inference-api/auth_server/requirements.txt create mode 100644 square-model-inference-api/docker-compose.yml create mode 100644 square-model-inference-api/inference_server/.dockerignore create mode 100644 square-model-inference-api/inference_server/Dockerfile create mode 100644 square-model-inference-api/inference_server/main.py create mode 100644 square-model-inference-api/inference_server/requirements.txt create mode 100644 square-model-inference-api/inference_server/square_model_inference/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/__init__.py create mode 100644 
square-model-inference-api/inference_server/square_model_inference/api/routes/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/routes/router.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/config.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/messages.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/model.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/transformer.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/prediction.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/request.py create mode 100644 square-model-inference-api/inference_server/tox.ini create mode 100644 square-model-inference-api/nginx/nginx.conf create mode 100644 square-model-inference-api/tests/__init__.py create mode 100644 square-model-inference-api/tests/conftest.py create mode 100644 square-model-inference-api/tests/test_api/__init__.py create mode 100644 square-model-inference-api/tests/test_api/test_api_auth.py create mode 100644 square-model-inference-api/tests/test_api/test_heartbeat.py create mode 100644 square-model-inference-api/tests/test_api/test_prediction.py create mode 100644 square-model-inference-api/tests/test_service/__init__.py create mode 100644 square-model-inference-api/tests/test_service/test_models.py diff --git a/square-model-inference-api/.gitignore b/square-model-inference-api/.gitignore new file mode 100644 index 000000000..7389010d8 --- /dev/null +++ b/square-model-inference-api/.gitignore @@ -0,0 +1,133 @@ +.idea/ + +/ml_model/models + +# Hidden files +.DS_store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# data is not logged +data/ + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +.vscode/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +htmlcov-py36/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +.vscode/ +.pytest-cache/ +.pytest_cache/ +.empty/ + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.idea/ + + + +Child_Watch_API/ +htmlcov-*/ + +# remove large model +ml_model/*.joblib + + +#venv +venv/ + + diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile new file mode 100644 index 000000000..e49710103 --- /dev/null +++ b/square-model-inference-api/Makefile @@ -0,0 +1,42 @@ +SHELL := /bin/bash + +# Target section and Global definitions +# ----------------------------------------------------------------------------- +.PHONY: all clean test install run deploy down + +all: clean test install run deploy down + +test: + python -m pip install --upgrade pip && pip install setuptools tox + tox + +install: + pip install --upgrade pip && pip install -r inference_server/requirements.txt + +run: + PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env.debug + +build: + docker-compose build + +deploy: + docker-compose build + docker-compose up -d + +down: + docker-compose down + +clean: + -find . -name '*.pyc' -exec rm -rf {} \; + -find . -name '__pycache__' -exec rm -rf {} \; + -find . -name 'Thumbs.db' -exec rm -rf {} \; + -find . -name '*~' -exec rm -rf {} \; + -rm -rf .cache + -rm -rf build + -rm -rf dist + -rm -rf *.egg-info + -rm -rf htmlcov* + -rm -rf .tox/ + -rm -rf docs/_build + -rm -r .coverage + -rm -rf .pytest_cache \ No newline at end of file diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md new file mode 100644 index 000000000..5b93ceaad --- /dev/null +++ b/square-model-inference-api/README.md @@ -0,0 +1,80 @@ +# SQuARE Model API +Inference API that supports SOTA (QA) models & adapters. +Receives input and returns prediction and other artifacts (e.g. attention scores) + +## Project structure + +TODO +``` +├───tests +│ ├───test_api +│ └───test_service +├───auth_server +│ └───auth_api +├───nginx +├───inference_server +│ ├───square_model_inference +│ │ ├───api - Model API +│ │ │ ├───routes +│ │ ├───core +│ │ ├───models - Pydantic model definitions for in-/ output +│ │ ├───inference +``` + +## Requirements + +Python 3.7+, Docker (optional), Make (optional) + +## Installation +Install the required packages in your local environment (ideally virtualenv, conda, etc.). + + +```sh +python -m venv venv +source venv/bin/activate +make install +``` + +#### Running Localhost + +```sh +make run +``` + +#### Running Via Docker + +```sh +make deploy +``` + +#### Running Tests + +```sh +make test +``` + +## Setup +TODO + +## Run without `make` for development + +1. Start your app with: +```bash +PYTHONPATH=./inference_server uvicorn main:app --reload +``` + +2. 
Go to [http://localhost:8000/docs](http://localhost:8000/docs) for the Swagger UI or [http://localhost:8000/redoc](http://localhost:8000/redoc) for the alternative ReDoc documentation + + +## Run Tests without using `make` + +Install testing libraries and run `tox`: +```bash +pip install tox pytest flake8 coverage bandit +tox +``` +This runs the tests with coverage under Python 3.8 as well as the Flake8 and Bandit checks. + + diff --git a/square-model-inference-api/auth_server/.env.example b/square-model-inference-api/auth_server/.env.example new file mode 100644 index 000000000..6125fa20f --- /dev/null +++ b/square-model-inference-api/auth_server/.env.example @@ -0,0 +1,2 @@ +API_KEY=example_key +API_KEY_NAME=api_key \ No newline at end of file diff --git a/square-model-inference-api/auth_server/Dockerfile b/square-model-inference-api/auth_server/Dockerfile new file mode 100644 index 000000000..0ced2f573 --- /dev/null +++ b/square-model-inference-api/auth_server/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.8.2 + +ENV PYTHONUNBUFFERED 1 + +EXPOSE 8081 +WORKDIR /app + +COPY requirements.txt ./ +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +COPY . ./ + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081"] \ No newline at end of file diff --git a/square-model-inference-api/auth_server/auth_api/__init__.py b/square-model-inference-api/auth_server/auth_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/auth_server/auth_api/config.py b/square-model-inference-api/auth_server/auth_api/config.py new file mode 100644 index 000000000..ebd10abc4 --- /dev/null +++ b/square-model-inference-api/auth_server/auth_api/config.py @@ -0,0 +1,7 @@ +from starlette.config import Config +from starlette.datastructures import Secret + +config = Config(".env") + +API_KEY: Secret = config("API_KEY", cast=Secret) +API_KEY_NAME: str = config("API_KEY_NAME", cast=str, default="api_key") diff --git a/square-model-inference-api/auth_server/auth_api/messages.py b/square-model-inference-api/auth_server/auth_api/messages.py new file mode 100644 index 000000000..85c28dc04 --- /dev/null +++ b/square-model-inference-api/auth_server/auth_api/messages.py @@ -0,0 +1,4 @@ +NO_API_KEY = "No API key provided." +AUTH_REQ = "Authentication required." +HTTP_500_DETAIL = "Internal server error."
+ diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py new file mode 100644 index 000000000..8f6082a8c --- /dev/null +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -0,0 +1,25 @@ +import secrets + +from fastapi import HTTPException, Security +from fastapi.security.api_key import APIKeyHeader, APIKeyQuery +from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED + +from .config import API_KEY, API_KEY_NAME +from .messages import AUTH_REQ, NO_API_KEY + +api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False) +api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) + + +def validate_request(query: str = Security(api_key_query), + header: str = Security(api_key_header),) -> bool: + if query is None and header is None: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY, headers={} + ) + elif not secrets.compare_digest(str(query), str(API_KEY)) \ + and not secrets.compare_digest(str(header), str(API_KEY)): + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, detail=AUTH_REQ, headers={} + ) + return True diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py new file mode 100644 index 000000000..90332c6d2 --- /dev/null +++ b/square-model-inference-api/auth_server/main.py @@ -0,0 +1,8 @@ +from fastapi import FastAPI, Depends +import auth_api.security as security + +app = FastAPI() + +@app.get("/auth") +async def auth(authenticated: bool = Depends(security.validate_request),): + return {"authenticated": True} diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt new file mode 100644 index 000000000..e059f7f5f --- /dev/null +++ b/square-model-inference-api/auth_server/requirements.txt @@ -0,0 +1,3 @@ +uvicorn +fastapi +pydantic diff --git a/square-model-inference-api/docker-compose.yml b/square-model-inference-api/docker-compose.yml new file mode 100644 index 000000000..ada141ee6 --- /dev/null +++ b/square-model-inference-api/docker-compose.yml @@ -0,0 +1,40 @@ +version: "3" + +services: + inference_bert: + build: ./inference_server/. + image: square_model_inference:latest + ports: + - 8000:8000 + env_file: + - ./inference_server/.env.bert + container_name: square_model_inference_bert + volumes: + - ./.cache/:/etc/huggingface/.cache/ + - ./inference_server/:/app/ #for development + + #inference_roberta: + # build: ./inference_server/. + # image: square_model_inference:latest + # ports: + # - 8001:8000 + # env_file: + # - ./inference_server/.env.roberta + # container_name: square_model_inference_roberta + # volumes: + # - ./.cache/:/etc/huggingface/.cache/ + # - ./inference_server/:/app/ #for development + + auth: + build: ./auth_server/.
+ ports: + - 8081:8081 + env_file: + - ./auth_server/.env.example + + nginx: + image: nginx + ports: + - 8080:8080 + volumes: + - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro \ No newline at end of file diff --git a/square-model-inference-api/inference_server/.dockerignore b/square-model-inference-api/inference_server/.dockerignore new file mode 100644 index 000000000..0af8215d8 --- /dev/null +++ b/square-model-inference-api/inference_server/.dockerignore @@ -0,0 +1,133 @@ +/images +/tests +.ascii-art + +*.yml +*.ini + +# Hidden files +.DS_store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# data is not logged +data/ + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +.vscode/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +htmlcov-py36/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +.vscode/ +.pytest-cache/ +.pytest_cache/ +.empty/ + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.idea/ + + + +Child_Watch_API/ +htmlcov-*/ + +# remove large model +ml_model/*.joblib + + +#venv +venv/ + +.idea/ \ No newline at end of file diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile new file mode 100644 index 000000000..4418f182a --- /dev/null +++ b/square-model-inference-api/inference_server/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.8.2 + +ENV PYTHONUNBUFFERED 1 + +EXPOSE 8000 +WORKDIR /app + +COPY requirements.txt ./ +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +COPY . 
./ + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py new file mode 100644 index 000000000..ba3ca507f --- /dev/null +++ b/square-model-inference-api/inference_server/main.py @@ -0,0 +1,21 @@ +from fastapi import FastAPI +from square_model_inference.api.routes.router import api_router +from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION, IS_DEBUG +from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler + + +def get_app() -> FastAPI: + fast_app = FastAPI(title=APP_NAME, version=APP_VERSION, debug=IS_DEBUG) + fast_app.include_router(api_router, prefix=API_PREFIX) + + fast_app.add_event_handler("startup", start_app_handler(fast_app)) + fast_app.add_event_handler("shutdown", stop_app_handler(fast_app)) + + return fast_app + + +app = get_app() + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/square-model-inference-api/inference_server/requirements.txt b/square-model-inference-api/inference_server/requirements.txt new file mode 100644 index 000000000..0b2dd68dc --- /dev/null +++ b/square-model-inference-api/inference_server/requirements.txt @@ -0,0 +1,8 @@ +uvicorn +fastapi +pydantic +loguru +python-dotenv +adapter-transformers +sentencepiece +torch \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/__init__.py b/square-model-inference-api/inference_server/square_model_inference/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/api/__init__.py b/square-model-inference-api/inference_server/square_model_inference/api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/__init__.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py new file mode 100644 index 000000000..702a921bc --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py @@ -0,0 +1,11 @@ +from fastapi import APIRouter + +from square_model_inference.models.heartbeat import HearbeatResult + +router = APIRouter() + + +@router.get("/heartbeat", response_model=HearbeatResult, name="heartbeat") +def get_hearbeat() -> HearbeatResult: + heartbeat = HearbeatResult(is_alive=True) + return heartbeat diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py new file mode 100644 index 000000000..5c2f6ec25 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -0,0 +1,20 @@ +from fastapi import APIRouter +from starlette.requests import Request + +from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.inference.model import Model + +router = APIRouter() + + +@router.post("/predict", 
response_model=PredictionOutput, name="predict") +async def predict( + request: Request, + prediction_request: PredictionRequest = None, +) -> PredictionOutput: + + model: Model = request.app.state.model + prediction: PredictionOutput = await model.predict(prediction_request) + + return prediction diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py new file mode 100644 index 000000000..b1fac0bfd --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py @@ -0,0 +1,7 @@ +from fastapi import APIRouter + +from square_model_inference.api.routes import heartbeat, prediction + +api_router = APIRouter() +api_router.include_router(heartbeat.router, tags=["health"], prefix="/health") +api_router.include_router(prediction.router, tags=["prediction"], prefix="/v1") diff --git a/square-model-inference-api/inference_server/square_model_inference/core/__init__.py b/square-model-inference-api/inference_server/square_model_inference/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py new file mode 100644 index 000000000..d56dd59e2 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -0,0 +1,18 @@ +from starlette.config import Config + +APP_VERSION = "0.1.0" +APP_NAME = "SQuARE Model Inference API" +API_PREFIX = "/api" + +config = Config(".env") + +IS_DEBUG: bool = config("IS_DEBUG", cast=bool, default=False) + +DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) +MODEL_NAME: str = config("MODEL_NAME") +MODEL_TYPE: str = config("MODEL_TYPE") +TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") + +MAX_BATCH_SIZE: int = config("MAX_BATCH_SIZE", cast=int, default=32) + +RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py new file mode 100644 index 000000000..74493aa50 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -0,0 +1,40 @@ +from typing import Callable + +from fastapi import FastAPI +from loguru import logger + +from square_model_inference.inference.adaptertransformer import AdapterTransformer +from square_model_inference.core.config import MODEL_TYPE +from square_model_inference.inference.transformer import Transformer + +MODEL_MAPPING = { + "adapter": AdapterTransformer, + "transformer": Transformer +} + + +def _startup_model(app: FastAPI) -> None: + if MODEL_TYPE not in MODEL_MAPPING: + raise RuntimeError(f"Unknown MODEL_MAPPING. 
Must be one of {MODEL_MAPPING.keys()}") + model_instance = MODEL_MAPPING[MODEL_TYPE]() + app.state.model = model_instance + + +def _shutdown_model(app: FastAPI) -> None: + app.state.model = None + + +def start_app_handler(app: FastAPI) -> Callable: + def startup() -> None: + logger.info("Running app start handler.") + _startup_model(app) + + return startup + + +def stop_app_handler(app: FastAPI) -> Callable: + def shutdown() -> None: + logger.info("Running app shutdown handler.") + _shutdown_model(app) + + return shutdown diff --git a/square-model-inference-api/inference_server/square_model_inference/core/messages.py b/square-model-inference-api/inference_server/square_model_inference/core/messages.py new file mode 100644 index 000000000..b1251e7fa --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/core/messages.py @@ -0,0 +1 @@ +HTTP_500_DETAIL = "Internal server error." diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/__init__.py b/square-model-inference-api/inference_server/square_model_inference/inference/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py new file mode 100644 index 000000000..8bbe71c96 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -0,0 +1,86 @@ +import json + +import torch +from loguru import logger + +from transformers import AutoModelWithHeads +from transformers.adapters.utils import download_cached, ADAPTER_HUB_INDEX_FILE + +from square_model_inference.inference.transformer import Transformer +from square_model_inference.models.request import PredictionRequest, Task + +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.core.config import MODEL_NAME, TRANSFORMERS_CACHE + + +class AdapterTransformer(Transformer): + def __init__(self): + self._load_model(AutoModelWithHeads) + self._load_adapter() + + def _load_adapter(self): + """ + Pre-load all available adapters for MODEL_NAME from adapterhub.ml. + We parse the hub index to extract all names and then load each model. + """ + logger.info("Loading all available adapters from adapterhub.ml") + index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(MODEL_NAME)) + adapter_index = json.load(open(index_file)) + adapters = set() + for task, datasets in adapter_index.items(): + for dataset in datasets.keys(): + for key in datasets[dataset].keys(): + if key != "default": + for org in datasets[dataset][key]["versions"].keys(): + adapters.add(f"{task}/{dataset}@{org}") + for adapter in adapters: + logger.debug(f"Loading adapter {adapter}") + self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_CACHE) + + # def _load_single_adapter(self, adapter_name: str): + # if adapter_name not in self.model.config.adapters.adapters: + # logger.info(f"Loading new adapter {adapter_name}") + # self.model.load_adapter(adapter_name, with_head=True, load_as=adapter_name) + # else: + # logger.debug(f"Adapter {adapter_name} is already loaded. 
Not loading again") + + def _token_classification(self, request: PredictionRequest) -> PredictionOutput: + # We only have to change the label2id mapping from config.label2id (what super() uses) to the mapping + # of the chosen head + prediction = super()._token_classification(request) + + label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] + id2label = {v:k for k,v in label2id.items()} + prediction.task_outputs["id2label"] = id2label + + return prediction + + def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: + # We only have to change the label2id mapping from config.label2id (what super() uses) to the mapping + # of the chosen head + prediction = super()._sequence_classification(request) + + label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] + id2label = {v:k for k,v in label2id.items()} + prediction.task_outputs["id2label"] = id2label + + return prediction + + async def predict(self, request: PredictionRequest) -> PredictionOutput: + if request.is_preprocessed: + ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + + if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: + ValueError(f"Unknown or missing adapter {request.adapter_name}. " + f"Please provider a fully specified adapter name from adapterhub.ml") + self.model.set_active_adapters(request.adapter_name) + + if request.task == Task.sequence_classification: + return self._sequence_classification(request) + elif request.task == Task.token_classification: + return self._token_classification(request) + elif request.task == Task.embedding: + return self._embedding(request) + elif request.task == Task.generation: + return self._generation(request) + diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/model.py b/square-model-inference-api/inference_server/square_model_inference/inference/model.py new file mode 100644 index 000000000..4aa92d8d2 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/model.py @@ -0,0 +1,18 @@ +from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.prediction import PredictionOutput + + +class Model: + """ + Base class for all models. + __init__ is supposed to load all weights and other necessary files (e.g. 
tokenizer) + so that the model can directly perform inference on request + """ + async def predict(self, payload: PredictionRequest) -> PredictionOutput: + """ + Take an input, pre-process it accordingly, perform inference according to the task, + post-process the result and return it + :param payload: the prediction request containing the input and any other parameters required + :return: the result of the prediction + """ + raise NotImplementedError diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py new file mode 100644 index 000000000..adb0b490a --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -0,0 +1,155 @@ +import json +from typing import Union, Tuple + +import torch +from loguru import logger + +from transformers import AutoTokenizer, AutoModel + +from square_model_inference.inference.model import Model +from square_model_inference.models.request import PredictionRequest, Task + +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.core.config import MODEL_NAME, DISABLE_GPU, MAX_BATCH_SIZE + + +class Transformer(Model): + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + + def __init__(self): + self._load_model(AutoModel) + + def _load_model(self, model_cls): + """ + Load the base model MODEL_NAME and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or environment variable DISABLE_GPU is true. + """ + logger.debug(f"Loading model {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + # Check if GPU is available + device = "cuda" if torch.cuda.is_available() and not DISABLE_GPU else "cpu" + model = model_cls.from_pretrained(MODEL_NAME).to(device) + logger.info(f"Model {MODEL_NAME} loaded on {device}") + + self.model = model + self.tokenizer = tokenizer + + def _ensure_tensor_on_device(self, **inputs): + """ + Ensure PyTorch tensors are on the specified device. + + Args: + inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`. + + Return: + :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device. + """ + return {name: tensor.to(self.model.device) for name, tensor in inputs.items()} + + def _predict(self, request: PredictionRequest, output_word_ids=False, output_attention_mask=False) \ + -> Union[dict, Tuple[dict, dict]]: + all_predictions = [] + features = self.tokenizer(request.input, + return_tensors="pt", + padding=True, + truncation=True, + **request.preprocessing_kwargs) + for start_idx in range(0, len(request.input), MAX_BATCH_SIZE): + with torch.no_grad(): + input_features = {k: features[k][start_idx:start_idx+MAX_BATCH_SIZE] for k in features.keys()} + input_features = self._ensure_tensor_on_device(**input_features) + predictions = self.model(**input_features, **request.model_kwargs) + all_predictions.append(predictions) + keys = all_predictions[0].keys() + final_prediction = {} + for key in keys: + # HuggingFace outputs for 'attentions' and more is returned as tuple of tensors + # Tuple of tuples only exists for 'past_key_values' which is only relevant for generation. 
+ # Generation should NOT use this function + if isinstance(all_predictions[0][key], tuple): + tuple_of_lists = list(zip(*[[p.cpu() for p in tpl[key]] for tpl in all_predictions])) + final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists) + else: + final_prediction[key] = torch.cat([p[key].cpu() for p in all_predictions]) + if any((output_word_ids, output_attention_mask)): + word_ids = [features.word_ids(i) for i in range(len(request.input))] + attention_mask = features["attention_mask"] + return final_prediction, {"word_ids": word_ids, "attention_mask": attention_mask} + return final_prediction + + def _embedding(self, request: PredictionRequest) -> PredictionOutput: + request.model_kwargs["output_hidden_states"] = True + predictions, other = self._predict(request, output_word_ids=True, output_attention_mask=True) + hidden_state = predictions.pop("hidden_states")[-1] + attention_mask = other["attention_mask"] + + embedding_mode = request.task_kwargs.get("embedding_mode", "mean") + if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: + ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") + + if embedding_mode == "cls": + emb = hidden_state[:, 0, :] + # copied from sentence-transformers pooling + elif embedding_mode == "max": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + hidden_state[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + emb = torch.max(hidden_state, 1)[0] + # copied from sentence-transformers pooling + elif embedding_mode == "mean": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + emb = sum_embeddings / sum_mask + elif embedding_mode == "token": + emb = hidden_state + predictions["embedding"] = emb + task_outputs = { + "embedding_mode": embedding_mode, + "word_ids": other["word_ids"] + } + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + def _token_classification(self, request: PredictionRequest) -> PredictionOutput: + predictions, other = self._predict(request, output_word_ids=True) + # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # We replace the logits by the softmax and add labels chosen with argmax + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) + label2id = self.model.config.label2id + id2label = {v:k for k,v in label2id.items()} + task_outputs = { + "id2label": id2label, + "word_ids": other["word_ids"] + } + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: + predictions = self._predict(request) + # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # We replace the logits by the softmax and add labels chosen with argmax + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) + label2id = self.model.config.label2id + id2label = 
{v:k for k,v in label2id.items()} + task_outputs = { + "id2label": id2label + } + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + async def predict(self, request: PredictionRequest) -> PredictionOutput: + if request.is_preprocessed: + ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + + if request.task == Task.sequence_classification: + return self._sequence_classification(request) + elif request.task == Task.token_classification: + return self._token_classification(request) + elif request.task == Task.embedding: + return self._embedding(request) + elif request.task == Task.generation: + return self._generation(request) + diff --git a/square-model-inference-api/inference_server/square_model_inference/models/__init__.py b/square-model-inference-api/inference_server/square_model_inference/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py new file mode 100644 index 000000000..34a47e56e --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class HearbeatResult(BaseModel): + is_alive: bool diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py new file mode 100644 index 000000000..03acbad00 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -0,0 +1,48 @@ +import torch +from io import BytesIO +import base64 +import numpy as np +from pydantic import Field, BaseModel +from square_model_inference.core.config import RETURN_PLAINTEXT_ARRAYS + +def _encode_numpy(obj): + def encode(arr): + if RETURN_PLAINTEXT_ARRAYS: + return arr.tolist() + else: + with BytesIO() as b: + np.save(b, arr) + arr_binary = b.getvalue() + arr_binary_b64 = base64.b64encode(arr_binary) + arr_string_b64 = arr_binary_b64.decode("latin1") + return arr_string_b64 + # LOADING WITH + # arr_binary_b64 = arr_string_b64.encode() + # arr_binary = base64.decodebytes(arr_binary_b64) + # arr = np.load(BytesIO(arr_binary)) + for k, v in obj.items(): + if isinstance(v, torch.Tensor): + v = v.numpy() + obj[k] = encode(v) + elif isinstance(v, tuple) and isinstance(v[0], torch.Tensor): + v = [encode(vv.numpy()) for vv in v] + obj[k] = v + return obj + + +class PredictionOutput(BaseModel): + """ + The results of the prediction of the model on the given input for the requested task. + """ + model_outputs: dict = Field( + description="Dictionary containing the model outputs as numpy arrays cast to lists." + ) + task_outputs: dict = Field( + description="Dictionary containing the task outputs. This covers anything from generated text, " + "id2label mappings, token2word mappings, etc." 
+ ) + + def __init__(self, **data): + super().__init__(**data) + self.model_outputs = _encode_numpy(self.model_outputs) + diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py new file mode 100644 index 000000000..3df379aa8 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -0,0 +1,54 @@ +from enum import Enum +from typing import Union, List, Optional + +from pydantic import BaseModel, Field + + +class Task(str, Enum): + """ + The available tasks that can be handled. Note that most models actually support only one task. + Requesting a task that a model cannot handle might fail or produce incorrect results. + """ + sequence_classification = "sequence_classification" + token_classification = "token_classification" + embedding = "embedding" + generation = "generation" + + +class PredictionRequest(BaseModel): + """ + Prediction request containing the input, pre-processing parameters, parameters for the model forward pass, + the task with task-specific parameters, and parameters for any post-processing. + """ + input: Union[List[str], List[List[str]], dict] = Field( + ..., + description="Input for the model. Supports Huggingface Transformer inputs (i.e., list of sentences, or " + "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " + "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True." + ) + is_preprocessed: bool = Field( + default=False, + description="Flag indicating that the input contains already pre-processed numpy arrays " + "as list and that it needs no further pre-processing." + ) + preprocessing_kwargs: dict = Field( + default={}, + description="Dictionary containing additional parameters for the pre-processing step." + ) + model_kwargs: dict = Field( + default={}, + description="Dictionary containing parameters that are passed to the model for the forward pass. " + "Set ‘output_attentions=True’ to receive the attention weights for Huggingface Transformers in the " + "output." + ) + task: Task = Field(...) + task_kwargs: dict = Field( + default={}, + description="Dictionary containing additional parameters for handling of the task and " + "task-related post-processing." + ) + adapter_name: Optional[str] = Field( + default="", + description="Only necessary for adapter-based models.
" + "The fully specified name of the to-be-used adapter from adapterhub.ml" + ) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tox.ini b/square-model-inference-api/inference_server/tox.ini new file mode 100644 index 000000000..e48ef368e --- /dev/null +++ b/square-model-inference-api/inference_server/tox.ini @@ -0,0 +1,66 @@ +[tox] +envlist = flake8,py38,bandit + +[testenv] +deps = + coverage + pytest + pytest-cov + pycodestyle>=2.0 +commands = + coverage erase + pip install -r {toxinidir}/requirements.txt + pytest --cov=tests --cov=square_model_inference --cov-report=term-missing --cov-config=setup.cfg + coverage report + coverage html -d htmlcov-{envname} + +[testenv:autopep8] +basepython = python3 +skip_install = true +deps = + autopep8>=1.5 +commands = + autopep8 --in-place --aggressive -r square_model_inference/ + + +[testenv:flake8] +basepython = python3 +skip_install = true +deps = + flake8 + flake8-bugbear + flake8-colors + flake8-typing-imports>=1.1 + pep8-naming +commands = + flake8 square_model_inference/ tests/ setup.py + +[testenv:bandit] +basepython = python3 +deps = + bandit +commands = + bandit -r square_model_inference/ + +[flake8] +exclude = + .tox, + .git, + __pycache__, + docs/source/conf.py, + build, + dist, + tests/fixtures/*, + *.pyc, + *.egg-info, + .cache, + .eggs +max-complexity = 10 +import-order-style = google +application-import-names = flake8 +max-line-length = 120 +ignore = + B008, + N803, + N806, + E126 \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf new file mode 100644 index 000000000..20c5c84d4 --- /dev/null +++ b/square-model-inference-api/nginx/nginx.conf @@ -0,0 +1,34 @@ +events { + worker_connections 1024; +} + +http { + server { + listen 8080; + + location / { + proxy_pass http://square_model_inference_bert:8000/; + } + + location /api/bert-base-uncased { + #auth_request /auth; + proxy_pass http://square_model_inference_bert:8000/api; + } + +# location /api/roberta-base { +# auth_request /auth; +# proxy_pass http://square_model_inference_roberta:8000/api; +# } + + location /auth { + internal; + set $query ''; + if ($request_uri ~* "[^\?]+\?(.*)$") { + set $query $1; + } + proxy_pass http://auth:8081/auth; + proxy_pass_request_body off; + proxy_set_header Content-Length ""; + } + } +} \ No newline at end of file diff --git a/square-model-inference-api/tests/__init__.py b/square-model-inference-api/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/tests/conftest.py b/square-model-inference-api/tests/conftest.py new file mode 100644 index 000000000..b21642754 --- /dev/null +++ b/square-model-inference-api/tests/conftest.py @@ -0,0 +1,16 @@ +import pytest +from starlette.config import environ +from starlette.testclient import TestClient + + +environ["API_KEY"] = "example_key" + + +from square_model_inference.main import get_app # noqa: E402 + + +@pytest.fixture() +def test_client(): + app = get_app() + with TestClient(app) as test_client: + yield test_client diff --git a/square-model-inference-api/tests/test_api/__init__.py b/square-model-inference-api/tests/test_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/tests/test_api/test_api_auth.py b/square-model-inference-api/tests/test_api/test_api_auth.py new file mode 100644 index 000000000..cc5467b20 --- /dev/null +++ b/square-model-inference-api/tests/test_api/test_api_auth.py 
@@ -0,0 +1,17 @@ +from square_model_inference.core import messages + + +def test_auth_using_prediction_api_no_apikey_header(test_client) -> None: + response = test_client.post("/api/v1/question") + assert response.status_code == 400 + assert response.json() == {"detail": messages.NO_API_KEY} + + +def test_auth_using_prediction_api_wrong_apikey_header(test_client) -> None: + response = test_client.post( + "/api/v1/question", + json={"context": "test", "question": "test"}, + headers={"token": "WRONG_TOKEN"}, + ) + assert response.status_code == 401 + assert response.json() == {"detail": messages.AUTH_REQ} diff --git a/square-model-inference-api/tests/test_api/test_heartbeat.py b/square-model-inference-api/tests/test_api/test_heartbeat.py new file mode 100644 index 000000000..343f594d7 --- /dev/null +++ b/square-model-inference-api/tests/test_api/test_heartbeat.py @@ -0,0 +1,14 @@ +from square_model_inference.main import get_app + +app = get_app() + + +def test_heartbeat(test_client) -> None: + response = test_client.get("/api/health/heartbeat") + assert response.status_code == 200 + assert response.json() == {"is_alive": True} + + +def test_default_route(test_client) -> None: + response = test_client.get("/") + assert response.status_code == 404 diff --git a/square-model-inference-api/tests/test_api/test_prediction.py b/square-model-inference-api/tests/test_api/test_prediction.py new file mode 100644 index 000000000..a00caf028 --- /dev/null +++ b/square-model-inference-api/tests/test_api/test_prediction.py @@ -0,0 +1,23 @@ +def test_prediction(test_client) -> None: + response = test_client.post( + "/api/v1/question", + json={"context": "two plus two equal four", "question": "What is four?"}, + headers={"token": "example_key"}, + ) + assert response.status_code == 200 + assert "score" in response.json() + + +def test_prediction_nopayload(test_client) -> None: + response = test_client.post( + "/api/v1/question", json={}, headers={"token": "example_key"} + ) + """ + ## if nopayload, default Hitchhiker's Guide to the Galaxy example is sent + # context:"42 is the answer to life, the universe and everything." + # question:"What is the answer to life?" 
+ # if no default, assert response.status_code == 422 + """ + + data = response.json() + assert data["answer"] == "42" diff --git a/square-model-inference-api/tests/test_service/__init__.py b/square-model-inference-api/tests/test_service/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/tests/test_service/test_models.py b/square-model-inference-api/tests/test_service/test_models.py new file mode 100644 index 000000000..61d666db7 --- /dev/null +++ b/square-model-inference-api/tests/test_service/test_models.py @@ -0,0 +1,27 @@ +import pytest + +from square_model_inference.core import config +from square_model_inference.models.payload import QAPredictionPayload +from square_model_inference.models.prediction import QAPredictionResult +from square_model_inference.transformers.nlp import QAModel + + +def test_prediction(test_client) -> None: + model_path = config.DEFAULT_MODEL_PATH + qa = QAPredictionPayload.parse_obj( + {"context": "two plus two equal four", "question": "What is four?"} + ) + + tm = QAModel(model_path) + result = tm.predict(qa) + assert isinstance(result, QAPredictionResult) + assert result.answer == "two plus two" + + +def test_prediction_no_payload(test_client) -> None: + model_path = config.DEFAULT_MODEL_PATH + + tm = QAModel(model_path) + with pytest.raises(ValueError): + result = tm.predict(None) + assert isinstance(result, QAPredictionResult) From dee676e5036389e1868be60a5ebeeaf0372247ed Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 10 Jun 2021 15:42:42 +0200 Subject: [PATCH 02/23] Changed model init to decouple from config, added question answering; added sentence-transformers --- .../{inference_server => }/.dockerignore | 0 square-model-inference-api/Makefile | 10 +- .../test_api => auth_server}/__init__.py | 0 .../__init__.py | 0 .../inference_server/main.py | 5 +- .../square_model_inference/core/config.py | 10 +- .../core/event_handlers.py | 15 +- .../inference/adaptertransformer.py | 16 +- .../inference/sentencetransformer.py | 45 +++++ .../inference/transformer.py | 186 ++++++++++++++---- .../models/prediction.py | 26 ++- .../square_model_inference/models/request.py | 1 + .../inference_server/tox.ini | 66 ------- 13 files changed, 247 insertions(+), 133 deletions(-) rename square-model-inference-api/{inference_server => }/.dockerignore (100%) rename square-model-inference-api/{tests/test_api => auth_server}/__init__.py (100%) rename square-model-inference-api/{tests/test_service => inference_server}/__init__.py (100%) create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py delete mode 100644 square-model-inference-api/inference_server/tox.ini diff --git a/square-model-inference-api/inference_server/.dockerignore b/square-model-inference-api/.dockerignore similarity index 100% rename from square-model-inference-api/inference_server/.dockerignore rename to square-model-inference-api/.dockerignore diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index e49710103..9ca741306 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -7,14 +7,14 @@ SHELL := /bin/bash all: clean test install run deploy down test: - python -m pip install --upgrade pip && pip install setuptools tox - tox + python -m pip install --upgrade pip && pip install pytest + pytest install: pip install --upgrade pip && pip install -r inference_server/requirements.txt run: - PYTHONPATH=inference_server/ uvicorn 
inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env.debug + PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env build: docker-compose build @@ -31,12 +31,8 @@ clean: -find . -name '__pycache__' -exec rm -rf {} \; -find . -name 'Thumbs.db' -exec rm -rf {} \; -find . -name '*~' -exec rm -rf {} \; - -rm -rf .cache -rm -rf build -rm -rf dist -rm -rf *.egg-info - -rm -rf htmlcov* - -rm -rf .tox/ -rm -rf docs/_build - -rm -r .coverage -rm -rf .pytest_cache \ No newline at end of file diff --git a/square-model-inference-api/tests/test_api/__init__.py b/square-model-inference-api/auth_server/__init__.py similarity index 100% rename from square-model-inference-api/tests/test_api/__init__.py rename to square-model-inference-api/auth_server/__init__.py diff --git a/square-model-inference-api/tests/test_service/__init__.py b/square-model-inference-api/inference_server/__init__.py similarity index 100% rename from square-model-inference-api/tests/test_service/__init__.py rename to square-model-inference-api/inference_server/__init__.py diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index ba3ca507f..4d4f0825c 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -1,11 +1,11 @@ from fastapi import FastAPI from square_model_inference.api.routes.router import api_router -from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION, IS_DEBUG +from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler def get_app() -> FastAPI: - fast_app = FastAPI(title=APP_NAME, version=APP_VERSION, debug=IS_DEBUG) + fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) fast_app.include_router(api_router, prefix=API_PREFIX) fast_app.add_event_handler("startup", start_app_handler(fast_app)) @@ -13,7 +13,6 @@ def get_app() -> FastAPI: return fast_app - app = get_app() if __name__ == "__main__": diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index d56dd59e2..a3973bdee 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -6,13 +6,19 @@ config = Config(".env") -IS_DEBUG: bool = config("IS_DEBUG", cast=bool, default=False) - +# Disable CUDA even if available DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) +# Corresponds to the Huggingface name for Transformers MODEL_NAME: str = config("MODEL_NAME") +# Type of the model, e.g. Transformers, Adapter, ... 
MODEL_TYPE: str = config("MODEL_TYPE") +# Cache directory where model weights are stored TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") MAX_BATCH_SIZE: int = config("MAX_BATCH_SIZE", cast=int, default=32) +# For MODEL_TYPE=transformers: decides the AutoModelFor* class used +MODEL_CLASS: str = config("MODEL_CLASS", default="base") + +# Flag that decides if returned numpy arrays are returned as lists or encoded to base64 (smaller but not easily human readable) RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 74493aa50..bec998460 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -4,19 +4,28 @@ from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.core.config import MODEL_TYPE +from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, MAX_BATCH_SIZE, TRANSFORMERS_CACHE +from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer MODEL_MAPPING = { "adapter": AdapterTransformer, - "transformer": Transformer + "transformer": Transformer, + "sentence-transformer": SentenceTransformer } +MODEL_KWARGS = { + "model_name": MODEL_NAME, + "model_class": MODEL_CLASS, + "disable_gpu": DISABLE_GPU, + "max_batch_size": MAX_BATCH_SIZE, + "transformers_cache": TRANSFORMERS_CACHE +} def _startup_model(app: FastAPI) -> None: if MODEL_TYPE not in MODEL_MAPPING: raise RuntimeError(f"Unknown MODEL_MAPPING. Must be one of {MODEL_MAPPING.keys()}") - model_instance = MODEL_MAPPING[MODEL_TYPE]() + model_instance = MODEL_MAPPING[MODEL_TYPE](**MODEL_KWARGS) app.state.model = model_instance diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 8bbe71c96..ed7e0b4c1 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -10,21 +10,21 @@ from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.core.config import MODEL_NAME, TRANSFORMERS_CACHE class AdapterTransformer(Transformer): - def __init__(self): - self._load_model(AutoModelWithHeads) - self._load_adapter() + def __init__(self, model_name, max_batch_size, disable_gpu, transformers_cache, **kwargs): + self._load_model(AutoModelWithHeads, model_name, disable_gpu) + self._load_adapter(model_name, transformers_cache) + self.max_batch_size = max_batch_size - def _load_adapter(self): + def _load_adapter(self, model_name, transformers_cache): """ Pre-load all available adapters for MODEL_NAME from adapterhub.ml. We parse the hub index to extract all names and then load each model. 
""" logger.info("Loading all available adapters from adapterhub.ml") - index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(MODEL_NAME)) + index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) adapter_index = json.load(open(index_file)) adapters = set() for task, datasets in adapter_index.items(): @@ -35,7 +35,7 @@ def _load_adapter(self): adapters.add(f"{task}/{dataset}@{org}") for adapter in adapters: logger.debug(f"Loading adapter {adapter}") - self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_CACHE) + self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) # def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: @@ -79,6 +79,8 @@ async def predict(self, request: PredictionRequest) -> PredictionOutput: return self._sequence_classification(request) elif request.task == Task.token_classification: return self._token_classification(request) + elif request.task == Task.question_answering: + return self._question_answering(request) elif request.task == Task.embedding: return self._embedding(request) elif request.task == Task.generation: diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py new file mode 100644 index 000000000..fdfd32ad6 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -0,0 +1,45 @@ +import json +from typing import Union, Tuple + +import torch +from loguru import logger +import numpy as np +from sentence_transformers import SentenceTransformer + +from square_model_inference.inference.model import Model +from square_model_inference.models.request import PredictionRequest, Task + +from square_model_inference.models.prediction import PredictionOutput + + +class SentenceTransformer(Model): + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + + def __init__(self, model_name, max_batch_size, disable_gpu, **kwargs): + self._load_model(model_name, disable_gpu) + self.max_batch_size = max_batch_size + + def _load_model(self, model_name, disable_gpu): + """ + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. + """ + logger.debug(f"Loading model {model_name}") + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = SentenceTransformer(model_name_or_path=model_name, device=device) + logger.info(f"Model {model_name} loaded on {device}") + self.model = model + + def _embedding(self, request: PredictionRequest) -> PredictionOutput: + embeddings = self.model.encode(request.input, batch_size=self.max_batch_size, show_progress_bar=False) + return PredictionOutput(model_outputs={"embeddings": embeddings}, task_outputs={}) + + + async def predict(self, request: PredictionRequest) -> PredictionOutput: + if request.is_preprocessed: + ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") + + if request.task != Task.embedding: + NotImplementedError("Only embedding task supported by this model") + return self._embedding(request) + diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index adb0b490a..c4c0a4f92 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -3,33 +3,42 @@ import torch from loguru import logger - -from transformers import AutoTokenizer, AutoModel +import numpy as np +from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \ + AutoModelForTokenClassification, AutoModelForQuestionAnswering from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.core.config import MODEL_NAME, DISABLE_GPU, MAX_BATCH_SIZE +CLASS_MAPPING = { + "base": AutoModel, + "sequence_classification": AutoModelForSequenceClassification, + "token_classifcation": AutoModelForTokenClassification, + "question_answering": AutoModelForQuestionAnswering +} class Transformer(Model): SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] - def __init__(self): - self._load_model(AutoModel) + def __init__(self, model_name, model_class, max_batch_size, disable_gpu, **kwargs): + if model_class not in CLASS_MAPPING: + raise RuntimeError(f"Unknown MODEL_CLASS. Must be one of {CLASS_MAPPING.keys()}") + self._load_model(CLASS_MAPPING[model_class], model_name, disable_gpu) + self.max_batch_size = max_batch_size - def _load_model(self, model_cls): + def _load_model(self, model_cls, model_name, disable_gpu): """ - Load the base model MODEL_NAME and its tokenizer with Huggingface. - Model will be moved to GPU unless CUDA is unavailable or environment variable DISABLE_GPU is true. + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. 
""" - logger.debug(f"Loading model {MODEL_NAME}") - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + logger.debug(f"Loading model {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) # Check if GPU is available - device = "cuda" if torch.cuda.is_available() and not DISABLE_GPU else "cpu" - model = model_cls.from_pretrained(MODEL_NAME).to(device) - logger.info(f"Model {MODEL_NAME} loaded on {device}") + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = model_cls.from_pretrained(model_name).to(device) + logger.info(f"Model {model_name} loaded on {device}") self.model = model self.tokenizer = tokenizer @@ -46,17 +55,17 @@ def _ensure_tensor_on_device(self, **inputs): """ return {name: tensor.to(self.model.device) for name, tensor in inputs.items()} - def _predict(self, request: PredictionRequest, output_word_ids=False, output_attention_mask=False) \ + def _predict(self, request: PredictionRequest, output_features=False) \ -> Union[dict, Tuple[dict, dict]]: all_predictions = [] + request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", True) + request.preprocessing_kwargs["truncation"] = request.preprocessing_kwargs.get("truncation", True) features = self.tokenizer(request.input, return_tensors="pt", - padding=True, - truncation=True, **request.preprocessing_kwargs) - for start_idx in range(0, len(request.input), MAX_BATCH_SIZE): + for start_idx in range(0, len(request.input), self.max_batch_size): with torch.no_grad(): - input_features = {k: features[k][start_idx:start_idx+MAX_BATCH_SIZE] for k in features.keys()} + input_features = {k: features[k][start_idx:start_idx+self.max_batch_size] for k in features.keys()} input_features = self._ensure_tensor_on_device(**input_features) predictions = self.model(**input_features, **request.model_kwargs) all_predictions.append(predictions) @@ -71,19 +80,22 @@ def _predict(self, request: PredictionRequest, output_word_ids=False, output_att final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists) else: final_prediction[key] = torch.cat([p[key].cpu() for p in all_predictions]) - if any((output_word_ids, output_attention_mask)): - word_ids = [features.word_ids(i) for i in range(len(request.input))] - attention_mask = features["attention_mask"] - return final_prediction, {"word_ids": word_ids, "attention_mask": attention_mask} + if output_features: + return final_prediction, features return final_prediction def _embedding(self, request: PredictionRequest) -> PredictionOutput: request.model_kwargs["output_hidden_states"] = True - predictions, other = self._predict(request, output_word_ids=True, output_attention_mask=True) + predictions, features = self._predict(request, output_features=True) + # We remove hidden_states from predictions! 
hidden_state = predictions.pop("hidden_states")[-1] - attention_mask = other["attention_mask"] + attention_mask = features["attention_mask"] embedding_mode = request.task_kwargs.get("embedding_mode", "mean") + task_outputs = { + "embedding_mode": embedding_mode + } + if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") @@ -102,42 +114,132 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: emb = sum_embeddings / sum_mask elif embedding_mode == "token": emb = hidden_state - predictions["embedding"] = emb - task_outputs = { - "embedding_mode": embedding_mode, - "word_ids": other["word_ids"] - } + task_outputs["word_ids"] = [features.word_ids(i) for i in range(len(request.input))] + predictions["embeddings"] = emb + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _token_classification(self, request: PredictionRequest) -> PredictionOutput: - predictions, other = self._predict(request, output_word_ids=True) + predictions, features = self._predict(request, output_features=True) # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): - probabilities = torch.softmax(predictions["logits"], dim=-1) - predictions["logits"] = probabilities - predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) label2id = self.model.config.label2id id2label = {v:k for k,v in label2id.items()} task_outputs = { "id2label": id2label, - "word_ids": other["word_ids"] + "word_ids": [features.word_ids(i) for i in range(len(request.input))] } + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: predictions = self._predict(request) - # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: - # We replace the logits by the softmax and add labels chosen with argmax - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): - probabilities = torch.softmax(predictions["logits"], dim=-1) - predictions["logits"] = probabilities - predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) label2id = self.model.config.label2id id2label = {v:k for k,v in label2id.items()} task_outputs = { "id2label": id2label } + # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # We replace the logits by the softmax and add labels chosen with argmax + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() + + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + def _generation(self, request: PredictionRequest) -> PredictionOutput: + NotImplementedError("Generation is currently not implemented") + + def _question_answering(self, 
request: PredictionRequest) -> PredictionOutput: + # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline + def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, undesired_tokens_: np.ndarray) -> Tuple: + """ + Take the output of any :obj:`ModelForQuestionAnswering` and will generate probabilities for each span to be the + actual answer. + + In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or + answer end position being before the starting position. The method supports output the k-best answer through + the topk argument. + + Args: + start_ (:obj:`np.ndarray`): Individual start probabilities for each token. + end (:obj:`np.ndarray`): Individual end_ probabilities for each token. + topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output. + max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output. + undesired_tokens_ (:obj:`np.ndarray`): Mask determining tokens that can be part of the answer + """ + # Ensure we have batch axis + if start_.ndim == 1: + start_ = start_[None] + + if end_.ndim == 1: + end_ = end_[None] + + # Compute the score of each tuple(start_, end_) to be the real answer + outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1)) + + # Remove candidate with end_ < start_ and end_ - start_ > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + starts_, ends_ = np.unravel_index(idx_sort, candidates.shape)[1:] + desired_spans = np.isin(starts_, undesired_tokens_.nonzero()) & np.isin(ends_, undesired_tokens_.nonzero()) + starts_ = starts_[desired_spans] + ends_ = ends_[desired_spans] + scores_ = candidates[0, starts_, ends_] + + return starts_, ends_, scores_ + + request.preprocessing_kwargs["truncation"] = "only_second" + predictions, features = self._predict(request, output_features=True) + + task_outputs = {"answers": []} + for idx, (start, end, (_, context)) in enumerate(zip(predictions["start_logits"], predictions["end_logits"], request.input)): + start = start.numpy() + end = end.numpy() + # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. 
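The decode helper above scores every (start, end) pair in one shot via an outer product of the start and end probabilities, then zeroes out spans that end before they start or exceed max_answer_len. A toy, self-contained illustration of that trick (all probabilities are made up):

import numpy as np

start = np.array([[0.1, 0.7, 0.1, 0.1]])  # P(token i is the answer start)
end = np.array([[0.1, 0.1, 0.6, 0.2]])    # P(token j is the answer end)
max_answer_len = 2

# score[i, j] = start[i] * end[j] for every candidate span (i, j)
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
# keep only spans with j >= i (triu) and j - i < max_answer_len (tril)
candidates = np.tril(np.triu(outer), max_answer_len - 1)
best = np.unravel_index(np.argmax(candidates), candidates.shape)
# best[1:] == (1, 2): tokens 1..2 form the best span, score approx. 0.7 * 0.6
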
+ undesired_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1) & features["attention_mask"][idx].numpy() + + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes in the tensor cannot contribute to the softmax + start = np.where(undesired_tokens_mask, -10000.0, start) + end = np.where(undesired_tokens_mask, -10000.0, end) + + start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True))) + end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True))) + + starts, ends, scores = decode( + start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 512), undesired_tokens + ) + enc = features[idx] + answers = [ + { + "score": score.item(), + "start": enc.word_to_chars( + enc.token_to_word(s), sequence_index=1)[0], + "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1], + "answer": context[ + enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0] : + enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]], + } + for s, e, score in zip(starts, ends, scores)] + task_outputs["answers"].append(answers) return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) async def predict(self, request: PredictionRequest) -> PredictionOutput: @@ -150,6 +252,8 @@ async def predict(self, request: PredictionRequest) -> PredictionOutput: return self._token_classification(request) elif request.task == Task.embedding: return self._embedding(request) + elif request.task == Task.question_answering: + return self._question_answering(request) elif request.task == Task.generation: return self._generation(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 03acbad00..a62c4151d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,3 +1,5 @@ +from typing import Dict, Union, Tuple + import torch from io import BytesIO import base64 @@ -5,18 +7,26 @@ from pydantic import Field, BaseModel from square_model_inference.core.config import RETURN_PLAINTEXT_ARRAYS -def _encode_numpy(obj): + +def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]]) -> Dict[str, Union[list, str]]: + """ + Encodes the Torch Tensors first to Numpy arrays and then makes either plain lists or base64-encode them depending + on the flag RETURN_PLAINTEXT_ARRAYS + :param obj: the objects whose tensors will be encoded + :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. + """ def encode(arr): if RETURN_PLAINTEXT_ARRAYS: return arr.tolist() else: + # np.save expects a file which we emulate with BytesIO with BytesIO() as b: np.save(b, arr) arr_binary = b.getvalue() arr_binary_b64 = base64.b64encode(arr_binary) arr_string_b64 = arr_binary_b64.decode("latin1") return arr_string_b64 - # LOADING WITH + # DECODE THE VALUE WITH # arr_binary_b64 = arr_string_b64.encode() # arr_binary = base64.decodebytes(arr_binary_b64) # arr = np.load(BytesIO(arr_binary)) @@ -35,14 +45,22 @@ class PredictionOutput(BaseModel): The results of the prediction of the model on the given input for the requested task. """ model_outputs: dict = Field( - description="Dictionary containing the model outputs as numpy arrays cast to lists." 
+ description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array." ) task_outputs: dict = Field( description="Dictionary containing the task outputs. This covers anything from generated text, " - "id2label mappings, token2word mappings, etc." + "labels, id2label mappings, token2word mappings, etc." ) def __init__(self, **data): + """ + Data model for the model and task outputs. + The model outputs (,i.e., tensors) will be encoded as base64 strings or as plain lists depending on the flag + RETURN_PLAINTEXT_ARRAYS. + :param data: + 'model_outputs': dict[str: Union[torch.Tensor, Tuple[torch.Tensor]]]. All tensor results of the model + 'task_outputs': dict[str: Any]. All non-tensor results of the processed task like the predicted labels, extracted spans, etc. + """ super().__init__(**data) self.model_outputs = _encode_numpy(self.model_outputs) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index 3df379aa8..a41190689 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -11,6 +11,7 @@ class Task(str, Enum): """ sequence_classification = "sequence_classification" token_classification = "token_classification" + question_answering = "question_answering" embedding = "embedding" generation = "generation" diff --git a/square-model-inference-api/inference_server/tox.ini b/square-model-inference-api/inference_server/tox.ini deleted file mode 100644 index e48ef368e..000000000 --- a/square-model-inference-api/inference_server/tox.ini +++ /dev/null @@ -1,66 +0,0 @@ -[tox] -envlist = flake8,py38,bandit - -[testenv] -deps = - coverage - pytest - pytest-cov - pycodestyle>=2.0 -commands = - coverage erase - pip install -r {toxinidir}/requirements.txt - pytest --cov=tests --cov=square_model_inference --cov-report=term-missing --cov-config=setup.cfg - coverage report - coverage html -d htmlcov-{envname} - -[testenv:autopep8] -basepython = python3 -skip_install = true -deps = - autopep8>=1.5 -commands = - autopep8 --in-place --aggressive -r square_model_inference/ - - -[testenv:flake8] -basepython = python3 -skip_install = true -deps = - flake8 - flake8-bugbear - flake8-colors - flake8-typing-imports>=1.1 - pep8-naming -commands = - flake8 square_model_inference/ tests/ setup.py - -[testenv:bandit] -basepython = python3 -deps = - bandit -commands = - bandit -r square_model_inference/ - -[flake8] -exclude = - .tox, - .git, - __pycache__, - docs/source/conf.py, - build, - dist, - tests/fixtures/*, - *.pyc, - *.egg-info, - .cache, - .eggs -max-complexity = 10 -import-order-style = google -application-import-names = flake8 -max-line-length = 120 -ignore = - B008, - N803, - N806, - E126 \ No newline at end of file From d9046a0835dc148528b0260a57e6e8c74cb33a5e Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 11 Jun 2021 16:33:39 +0200 Subject: [PATCH 03/23] Startet adding tests --- square-model-inference-api/Makefile | 4 +- .../inference_server/Dockerfile | 13 +- .../api/routes/router.py | 2 +- .../inference/adaptertransformer.py | 4 +- .../inference/sentencetransformer.py | 4 +- .../inference/transformer.py | 13 +- .../models/prediction.py | 4 +- .../square_model_inference/models/request.py | 2 +- .../inference_server/tests/conftest.py | 85 ++++++ .../pre_test_setup_for_docker_caching.py | 24 ++ 
.../tests/test_api/test_heartbeat.py | 10 +- .../tests/test_api/test_prediction.py | 39 +++ .../test_prediction_output_numpy.py | 36 +++ .../tests/test_inference/test_transformers.py | 273 ++++++++++++++++++ square-model-inference-api/tests/__init__.py | 0 square-model-inference-api/tests/conftest.py | 16 - .../tests/test_api/test_api_auth.py | 17 -- .../tests/test_api/test_prediction.py | 23 -- .../tests/test_service/test_models.py | 27 -- 19 files changed, 491 insertions(+), 105 deletions(-) create mode 100644 square-model-inference-api/inference_server/tests/conftest.py create mode 100644 square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py rename square-model-inference-api/{ => inference_server}/tests/test_api/test_heartbeat.py (52%) create mode 100644 square-model-inference-api/inference_server/tests/test_api/test_prediction.py create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_transformers.py delete mode 100644 square-model-inference-api/tests/__init__.py delete mode 100644 square-model-inference-api/tests/conftest.py delete mode 100644 square-model-inference-api/tests/test_api/test_api_auth.py delete mode 100644 square-model-inference-api/tests/test_api/test_prediction.py delete mode 100644 square-model-inference-api/tests/test_service/test_models.py diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 9ca741306..7acebffb0 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -7,8 +7,8 @@ SHELL := /bin/bash all: clean test install run deploy down test: - python -m pip install --upgrade pip && pip install pytest - pytest + python -m pip install --upgrade pip && pip install pytest pytest-cov pytest-asyncio + pytest --cov install: pip install --upgrade pip && pip install -r inference_server/requirements.txt diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index 4418f182a..dee4290fb 100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8.2 +FROM python:3.8.2 as build ENV PYTHONUNBUFFERED 1 @@ -9,6 +9,17 @@ COPY requirements.txt ./ RUN pip install --upgrade pip && \ pip install -r requirements.txt +COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_docker_caching.py +RUN python ./tests/pre_test_setup_for_docker_caching.py + COPY . 
./ +FROM base as test +WORKDIR /app +RUN pip install pytest pytest-cov pytest-asyncio +COPY ../tests tests +RUN pytest --cov + +FROM base as build + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py index b1fac0bfd..67d60b65e 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py @@ -4,4 +4,4 @@ api_router = APIRouter() api_router.include_router(heartbeat.router, tags=["health"], prefix="/health") -api_router.include_router(prediction.router, tags=["prediction"], prefix="/v1") +api_router.include_router(prediction.router, tags=["prediction"]) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index ed7e0b4c1..e76259aa3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -68,10 +68,10 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: - ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: - ValueError(f"Unknown or missing adapter {request.adapter_name}. " + raise ValueError(f"Unknown or missing adapter {request.adapter_name}. " f"Please provider a fully specified adapter name from adapterhub.ml") self.model.set_active_adapters(request.adapter_name) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index fdfd32ad6..f6acf7672 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -37,9 +37,9 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: - ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") if request.task != Task.embedding: - NotImplementedError("Only embedding task supported by this model") + raise NotImplementedError("Only embedding task supported by this model") return self._embedding(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index c4c0a4f92..32c34cc59 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -5,7 +5,7 @@ from loguru import logger import numpy as np from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \ - AutoModelForTokenClassification, AutoModelForQuestionAnswering + AutoModelForTokenClassification, AutoModelForQuestionAnswering, AutoModelForCausalLM from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task @@ -15,8 +15,9 @@ CLASS_MAPPING = { "base": AutoModel, "sequence_classification": AutoModelForSequenceClassification, - "token_classifcation": AutoModelForTokenClassification, - "question_answering": AutoModelForQuestionAnswering + "token_classification": AutoModelForTokenClassification, + "question_answering": AutoModelForQuestionAnswering, + "generation": AutoModelForCausalLM } class Transformer(Model): @@ -97,7 +98,7 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: } if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: - ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") + raise ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") if embedding_mode == "cls": emb = hidden_state[:, 0, :] @@ -153,7 +154,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _generation(self, request: PredictionRequest) -> PredictionOutput: - NotImplementedError("Generation is currently not implemented") + raise NotImplementedError("Generation is currently not implemented") def _question_answering(self, request: PredictionRequest) -> PredictionOutput: # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline @@ -244,7 +245,7 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: - ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") if request.task == Task.sequence_classification: return self._sequence_classification(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index a62c4151d..333b150e7 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -8,7 +8,7 @@ from square_model_inference.core.config import RETURN_PLAINTEXT_ARRAYS -def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]]) -> Dict[str, Union[list, str]]: +def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]], return_plaintext: bool=RETURN_PLAINTEXT_ARRAYS) -> Dict[str, Union[list, str]]: """ Encodes the Torch Tensors first to Numpy arrays and then makes either plain lists or base64-encode them depending on the flag RETURN_PLAINTEXT_ARRAYS @@ -16,7 +16,7 @@ def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]]) -> D :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. """ def encode(arr): - if RETURN_PLAINTEXT_ARRAYS: + if return_plaintext: return arr.tolist() else: # np.save expects a file which we emulate with BytesIO diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index a41190689..ab3c7e1ea 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -39,7 +39,7 @@ class PredictionRequest(BaseModel): model_kwargs: dict = Field( default={}, description="Dictionary containing parameters that are passed to the model for the forward pass. " - "Set ‘output_attention=True’ to receive the attention weights for Huggingface Transformers in the " + "Set ‘output_attentions=True’ to receive the attention weights for Huggingface Transformers in the " "output." ) task: Task = Field(...) diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py new file mode 100644 index 000000000..8a2f333f8 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -0,0 +1,85 @@ +import pytest + +from fastapi.testclient import TestClient +from pre_test_setup_for_docker_caching import TRANSFORMERS_TESTING_CACHE, TRANSFORMER_MODEL, SENTENCE_MODEL +import torch + +## Due to import and config reasons, the environ is set in pre_test_setup_for_docker_caching ! +## (because we import Transformer, which imports Model, imports PredictionOutput, which imports RETURN_PLAINTEXT_ARRAYS and this creates the starlette config. 
+## The config is read by this point and starlette forbids overwriting it then) +# from starlette.config import environ +# environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE +# environ["MODEL_NAME"] = "test" +# environ["MODEL_TYPE"] = "test" +# environ["DISABLE_GPU"] = "True" +# environ["MAX_BATCH_SIZE"] = "1" +# environ["RETURN_PLAINTEXT_ARRAYS"] = "False" + +from main import get_app +from square_model_inference.inference.model import Model +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.models.request import PredictionRequest +from square_model_inference.inference.transformer import Transformer +from square_model_inference.inference.adaptertransformer import AdapterTransformer +from square_model_inference.inference.sentencetransformer import SentenceTransformer + +@pytest.fixture(scope="session") +def test_app(): + app = get_app() + app.state.model = TestModel() + return app + + +@pytest.fixture() +def test_client(test_app): + with TestClient(test_app) as test_client: + yield test_client + + +class TestModel(Model): + def __init__(self): + self.prediction = PredictionOutput(model_outputs={}, task_outputs={}) + + async def predict(self, payload: PredictionRequest) -> PredictionOutput: + return self.prediction + + +# We only load bert-base-uncased, so we fix the random seed to always get the same randomly generated heads on top +@pytest.fixture(scope="class") +def test_transformer_sequence_classification(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "sequence_classification", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_embedding(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "base", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_token_classification(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "token_classification", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_question_answering(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "question_answering", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_generation(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "generation", 1, True) + + +@pytest.fixture(scope="class") +def test_adapter(): + return AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) + + +@pytest.fixture(scope="class") +def test_sentence_transformer(): + return SentenceTransformer(SENTENCE_MODEL, 1, True) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py new file mode 100644 index 000000000..fe6794e6b --- /dev/null +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -0,0 +1,24 @@ +# This file is used for the Docker image to cache long-running setup for the tests, +# i.e., downloading finetuned Transformer models and so on. 
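The Transformer fixtures above call torch.manual_seed before building each model because bert-base-uncased ships no task heads: the AutoModelFor* classes attach randomly initialised ones, and fixing the seed is what makes the hard-coded expected values in the regression-style tests below reproducible. A small sketch of the idea (the seed value and attribute access are illustrative):

import torch
from transformers import AutoModelForSequenceClassification

torch.manual_seed(987654321)
m1 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
torch.manual_seed(987654321)
m2 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# Same seed, same randomly initialised classification head.
assert torch.equal(m1.classifier.weight, m2.classifier.weight)
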
+# This way, adding new tests or even changing the server code does NOT trigger a new download during building +from starlette.config import environ +TRANSFORMERS_TESTING_CACHE = "./model_testing_cache" +environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE +environ["MODEL_NAME"] = "test" +environ["MODEL_TYPE"] = "test" +environ["DISABLE_GPU"] = "True" +environ["MAX_BATCH_SIZE"] = "1" +environ["RETURN_PLAINTEXT_ARRAYS"] = "True" + +from square_model_inference.inference.transformer import Transformer +from square_model_inference.inference.adaptertransformer import AdapterTransformer +from square_model_inference.inference.sentencetransformer import SentenceTransformer + +#Downloaded models: +TRANSFORMER_MODEL = "bert-base-uncased" +SENTENCE_MODEL = "paraphrase-MiniLM-L6-v2 " + +if __name__ == "__main__": + bert_transformer = Transformer(TRANSFORMER_MODEL, "base", 1, True) + adapter_transformer = AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) + sentence_transformer = SentenceTransformer(SENTENCE_MODEL) \ No newline at end of file diff --git a/square-model-inference-api/tests/test_api/test_heartbeat.py b/square-model-inference-api/inference_server/tests/test_api/test_heartbeat.py similarity index 52% rename from square-model-inference-api/tests/test_api/test_heartbeat.py rename to square-model-inference-api/inference_server/tests/test_api/test_heartbeat.py index 343f594d7..eade6ef16 100644 --- a/square-model-inference-api/tests/test_api/test_heartbeat.py +++ b/square-model-inference-api/inference_server/tests/test_api/test_heartbeat.py @@ -1,14 +1,14 @@ -from square_model_inference.main import get_app +from starlette.testclient import TestClient -app = get_app() - -def test_heartbeat(test_client) -> None: +def test_heartbeat(test_app) -> None: + test_client = TestClient(test_app) response = test_client.get("/api/health/heartbeat") assert response.status_code == 200 assert response.json() == {"is_alive": True} -def test_default_route(test_client) -> None: +def test_default_route(test_app) -> None: + test_client = TestClient(test_app) response = test_client.get("/") assert response.status_code == 404 diff --git a/square-model-inference-api/inference_server/tests/test_api/test_prediction.py b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py new file mode 100644 index 000000000..9829ff680 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py @@ -0,0 +1,39 @@ +from starlette.testclient import TestClient + + +def test_prediction(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/predict", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 + + +def test_prediction_wrong_request(test_app) -> None: + test_client = TestClient(test_app, raise_server_exceptions=False) + response = test_client.post( + "/api/predict", json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + # required + #"model_kwargs": {}, + #"task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 422 diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py b/square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py new file 
mode 100644 index 000000000..35c4309c6 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py @@ -0,0 +1,36 @@ +from starlette.config import Environ +from square_model_inference.models.prediction import PredictionOutput, _encode_numpy +from io import BytesIO +import base64 +import torch +import numpy as np + +def test_prediction_output_numpy_encoded() -> None: + + arr = np.ones(shape=(10,10), dtype="float32") + + output = _encode_numpy({"test": torch.from_numpy(arr)}, return_plaintext=False) + + encoded_arr = output["test"] + + # reversing code + arr_binary_b64 = encoded_arr.encode() + arr_binary = base64.decodebytes(arr_binary_b64) + arr_back = np.load(BytesIO(arr_binary)) + + np.testing.assert_equal(arr, arr_back) + + +def test_prediction_output_numpy_plaintext() -> None: + + arr = np.ones(shape=(10,10), dtype="float32") + + output = _encode_numpy({"test": torch.from_numpy(arr)}, return_plaintext=True) + + plaintext_list_arr = output["test"] + + # reversing code + arr_back = np.array(plaintext_list_arr) + + np.testing.assert_equal(arr, arr_back) + diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py new file mode 100644 index 000000000..a8631c0ef --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -0,0 +1,273 @@ +import pytest + +from square_model_inference.models.request import PredictionRequest +import numpy as np + +@pytest.mark.usefixtures("test_transformer_sequence_classification") +class TestTransformerSequenceClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output", [(["this is a test"], [1]), + (["this is a test", "this is a test with a longer sentence"], [1, 1])], + ids=["single", "batch"]) + async def test_sequence_classification(self, test_transformer_sequence_classification, input, output): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_sequence_classification.predict(request) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are not converted to softmax") + assert prediction.task_outputs["labels"] == output + assert "logits" in prediction.model_outputs + assert "id2label" in prediction.task_outputs + + @pytest.mark.asyncio + async def test_sequence_classification_output_attention(self, test_transformer_sequence_classification): + request = PredictionRequest.parse_obj({ + "input": ["testing test"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {"output_attentions": True}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_sequence_classification.predict(request) + assert "attentions" in prediction.model_outputs + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output", [(["this is a test"], [[0.17748619616031647, 0.4802907109260559]]), + (["this is a test", "this is a test with a longer sentence"], + [[0.17748622596263885, 0.48029083013534546], [-0.048022523522377014, 0.5764690637588501]])], + ids=["single", "batch"]) + async def test_sequence_classification_regression(self, 
test_transformer_sequence_classification, input, output): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {"regression": True}, + "adapter_name": "" + }) + + prediction = await test_transformer_sequence_classification.predict(request) + assert prediction.model_outputs["logits"] == output + assert "labels" not in prediction.task_outputs + assert "logits" in prediction.model_outputs + + +@pytest.mark.usefixtures("test_transformer_token_classification") +class TestTransformerTokenClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[0, 1, 1, 1, 1, 1]], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification(self, test_transformer_token_classification, input, output, word_ids): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "token_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_token_classification.predict(request) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(output), err_msg="logits are not converted to softmax") + assert prediction.task_outputs["labels"] == output + assert "logits" in prediction.model_outputs + assert "id2label" in prediction.task_outputs + assert prediction.task_outputs["word_ids"] == word_ids + + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[[0.11221601068973541, 0.05969493091106415], + [-0.15642595291137695, -0.07739989459514618], + [-0.2801300287246704, 0.06872765719890594], + [-0.28268659114837646, -0.01801353693008423], + [0.032697953283786774, 0.051608189940452576], + [0.01579795777797699, 0.23798659443855286]]], + [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[[0.11221593618392944, 0.059694863855838776], + [-0.15642589330673218, -0.07740016281604767], + [-0.2801305651664734, 0.06872743368148804], + [-0.28268682956695557, -0.018013179302215576], + [0.03269825875759125, 0.051608070731163025], + [0.01579788327217102, 0.2379867136478424], + [0.03173816204071045, -0.15285855531692505], + [0.11003853380680084, -0.12873490154743195], + [0.13318589329719543, -0.10646772384643555], + [0.13220593333244324, -0.12443935126066208]], + [[-0.05789753794670105, 0.04508095979690552], + [-0.27872341871261597, -0.1229611188173294], + [-0.5753417015075684, 0.07893967628479004], + [-0.47106701135635376, -0.04298338294029236], + [-0.5324697494506836, 0.10730768740177155], + [-0.31086403131484985, 0.3265591263771057], + [-0.2457294911146164, 0.24867495894432068], + [-0.2389427125453949, 0.4464331567287445], + [-0.08393016457557678, -0.03680300712585449], + [-0.01722254604101181, 0.41447973251342773]]], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification_regression(self, test_transformer_token_classification, input, output, word_ids): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": 
False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "token_classification", + "task_kwargs": {"regression": True}, + "adapter_name": "" + }) + + prediction = await test_transformer_token_classification.predict(request) + assert prediction.model_outputs["logits"] == output + assert "labels" not in prediction.task_outputs + assert "logits" in prediction.model_outputs + assert prediction.task_outputs["word_ids"] == word_ids + + +@pytest.mark.usefixtures("test_transformer_embedding") +class TestTransformerSequenceClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,mode", [(["this is a test"], "mean"), + (["this is a test", "this is a test with a longer sentence"], "mean"), + (["this is a test"], "max"), + (["this is a test", "this is a test with a longer sentence"], "max"), + (["this is a test"], "cls"), + (["this is a test", "this is a test with a longer sentence"], "cls")], + ) + async def test_embedding(self, test_transformer_embedding, input, mode): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {"embedding_mode": mode}, + "adapter_name": "" + }) + + prediction = await test_transformer_embedding.predict(request) + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.task_outputs["embedding_mode"] == mode + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_embedding_token(self, test_transformer_embedding, input, word_ids): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {"embedding_mode": "token"}, + "adapter_name": "" + }) + + prediction = await test_transformer_embedding.predict(request) + assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.task_outputs["embedding_mode"] == "token" + assert prediction.task_outputs["word_ids"] == word_ids + + @pytest.mark.asyncio + async def test_embedding_unknown_mode(self, test_transformer_embedding): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {"embedding_mode": "this mode does not exist"}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_transformer_embedding.predict(request) + + +@pytest.mark.usefixtures("test_transformer_question_answering") +class TestTransformerSequenceClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [([["What is a test?", "A test is a thing where you test."]]), + ([["What is a test?", "A test is a thing where you test."], + ["What is a longer test?", "A test is a thing where you test. 
If it is longer you call it longer"]])], + ) + async def test_question_answering(self, test_transformer_question_answering, input): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "question_answering", + "task_kwargs": {"topk": 1}, + "adapter_name": "" + }) + + prediction = await test_transformer_question_answering.predict(request) + answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.task_outputs["answers"]) == len(input) + assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) + + @pytest.mark.asyncio + async def test_question_answering_topk(self, test_transformer_question_answering): + input = [["What is a test?", "A test is a thing where you test."]] + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "question_answering", + "task_kwargs": {"topk": 2}, + "adapter_name": "" + }) + + prediction = await test_transformer_question_answering.predict(request) + answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.task_outputs["answers"]) == len(input) + assert all(prediction.task_outputs["answers"][0][i]["answer"] == answers[i] for i in range(2)) + + +@pytest.mark.usefixtures("test_transformer_generation") +class TestTransformerGeneration: + @pytest.mark.asyncio + async def test_generation(self, test_transformer_generation): + request = PredictionRequest.parse_obj({ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "generation", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_generation.predict(request) + assert prediction \ No newline at end of file diff --git a/square-model-inference-api/tests/__init__.py b/square-model-inference-api/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/square-model-inference-api/tests/conftest.py b/square-model-inference-api/tests/conftest.py deleted file mode 100644 index b21642754..000000000 --- a/square-model-inference-api/tests/conftest.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest -from starlette.config import environ -from starlette.testclient import TestClient - - -environ["API_KEY"] = "example_key" - - -from square_model_inference.main import get_app # noqa: E402 - - -@pytest.fixture() -def test_client(): - app = get_app() - with TestClient(app) as test_client: - yield test_client diff --git a/square-model-inference-api/tests/test_api/test_api_auth.py b/square-model-inference-api/tests/test_api/test_api_auth.py deleted file mode 100644 index cc5467b20..000000000 --- a/square-model-inference-api/tests/test_api/test_api_auth.py +++ /dev/null @@ -1,17 +0,0 @@ -from square_model_inference.core import messages - - -def test_auth_using_prediction_api_no_apikey_header(test_client) -> None: - response = test_client.post("/api/v1/question") - assert response.status_code == 400 - assert response.json() == {"detail": messages.NO_API_KEY} - - -def 
test_auth_using_prediction_api_wrong_apikey_header(test_client) -> None: - response = test_client.post( - "/api/v1/question", - json={"context": "test", "question": "test"}, - headers={"token": "WRONG_TOKEN"}, - ) - assert response.status_code == 401 - assert response.json() == {"detail": messages.AUTH_REQ} diff --git a/square-model-inference-api/tests/test_api/test_prediction.py b/square-model-inference-api/tests/test_api/test_prediction.py deleted file mode 100644 index a00caf028..000000000 --- a/square-model-inference-api/tests/test_api/test_prediction.py +++ /dev/null @@ -1,23 +0,0 @@ -def test_prediction(test_client) -> None: - response = test_client.post( - "/api/v1/question", - json={"context": "two plus two equal four", "question": "What is four?"}, - headers={"token": "example_key"}, - ) - assert response.status_code == 200 - assert "score" in response.json() - - -def test_prediction_nopayload(test_client) -> None: - response = test_client.post( - "/api/v1/question", json={}, headers={"token": "example_key"} - ) - """ - ## if nopayload, default Hitchhiker's Guide to the Galaxy example is sent - # context:"42 is the answer to life, the universe and everything." - # question:"What is the answer to life?" - # if no default, assert response.status_code == 422 - """ - - data = response.json() - assert data["answer"] == "42" diff --git a/square-model-inference-api/tests/test_service/test_models.py b/square-model-inference-api/tests/test_service/test_models.py deleted file mode 100644 index 61d666db7..000000000 --- a/square-model-inference-api/tests/test_service/test_models.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -from square_model_inference.core import config -from square_model_inference.models.payload import QAPredictionPayload -from square_model_inference.models.prediction import QAPredictionResult -from square_model_inference.transformers.nlp import QAModel - - -def test_prediction(test_client) -> None: - model_path = config.DEFAULT_MODEL_PATH - qa = QAPredictionPayload.parse_obj( - {"context": "two plus two equal four", "question": "What is four?"} - ) - - tm = QAModel(model_path) - result = tm.predict(qa) - assert isinstance(result, QAPredictionResult) - assert result.answer == "two plus two" - - -def test_prediction_no_payload(test_client) -> None: - model_path = config.DEFAULT_MODEL_PATH - - tm = QAModel(model_path) - with pytest.raises(ValueError): - result = tm.predict(None) - assert isinstance(result, QAPredictionResult) From 074fc3bc1b8f6510f1ff6cd441a3d7b58d8e6dd2 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 17 Jun 2021 14:29:51 +0200 Subject: [PATCH 04/23] Added generation for transformers; continued tests (missing for now just adapters) --- .../inference_server/Dockerfile | 6 +- .../inference/sentencetransformer.py | 6 +- .../inference/transformer.py | 34 ++++- .../models/prediction.py | 20 ++- .../inference_server/tests/conftest.py | 7 +- .../pre_test_setup_for_docker_caching.py | 2 +- .../tests/test_inference/test_adapter.py | 10 ++ .../test_sentence_transformer.py | 54 +++++++ .../tests/test_inference/test_transformers.py | 135 +++++++++++------- 9 files changed, 198 insertions(+), 76 deletions(-) create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_adapter.py create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index dee4290fb..fe2b5344c 
100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -9,8 +9,8 @@ COPY requirements.txt ./ RUN pip install --upgrade pip && \ pip install -r requirements.txt -COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_docker_caching.py -RUN python ./tests/pre_test_setup_for_docker_caching.py +COPY ./inference_server/tests/pre_test_setup_for_docker_caching.py ./inference_server/tests/pre_test_setup_for_docker_caching.py +RUN PYTHONPATH=./inference_server; python ./inference_server/tests/pre_test_setup_for_docker_caching.py COPY . ./ @@ -18,7 +18,7 @@ FROM base as test WORKDIR /app RUN pip install pytest pytest-cov pytest-asyncio COPY ../tests tests -RUN pytest --cov +RUN PYTHONPATH=./inference_server; pytest --cov FROM base as build diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index f6acf7672..7ed77b813 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -4,7 +4,7 @@ import torch from loguru import logger import numpy as np -from sentence_transformers import SentenceTransformer +from sentence_transformers import SentenceTransformer as SentenceTransformerModel from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task @@ -26,7 +26,7 @@ def _load_model(self, model_name, disable_gpu): """ logger.debug(f"Loading model {model_name}") device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" - model = SentenceTransformer(model_name_or_path=model_name, device=device) + model = SentenceTransformerModel(model_name_or_path=model_name, device=device) logger.info(f"Model {model_name} loaded on {device}") self.model = model @@ -40,6 +40,6 @@ async def predict(self, request: PredictionRequest) -> PredictionOutput: raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") if request.task != Task.embedding: - raise NotImplementedError("Only embedding task supported by this model") + raise ValueError("Only embedding task supported by this model") return self._embedding(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 32c34cc59..a6994ec4d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -1,4 +1,5 @@ import json +from collections import defaultdict from typing import Union, Tuple import torch @@ -130,7 +131,7 @@ def _token_classification(self, request: PredictionRequest) -> PredictionOutput: "id2label": id2label, "word_ids": [features.word_ids(i) for i in range(len(request.input))] } - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("is_regression", False): probabilities = torch.softmax(predictions["logits"], dim=-1) predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() @@ -146,7 +147,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp } # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("is_regression", False): probabilities = torch.softmax(predictions["logits"], dim=-1) predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() @@ -154,7 +155,34 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _generation(self, request: PredictionRequest) -> PredictionOutput: - raise NotImplementedError("Generation is currently not implemented") + request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", False) + request.preprocessing_kwargs["add_special_tokens"] = request.preprocessing_kwargs.get("add_special_tokens", False) + task_outputs = {"generated_texts": []} + model_outputs = defaultdict(list) + for prompt in request.input: + features = self.tokenizer(prompt, return_tensors="pt", **request.preprocessing_kwargs) + input_ids = features["input_ids"] + input_ids = self._ensure_tensor_on_device(input_ids=input_ids)["input_ids"] + request.model_kwargs.update(request.task_kwargs) + request.model_kwargs["return_dict_in_generate"] = True + res = self.model.generate(input_ids, **request.model_kwargs) + + # put everything on CPU and add it to model_outputs + for key in res.keys(): + if isinstance(res[key], tuple): + if isinstance(res[key][0], tuple): + res[key] = tuple((tuple(tensor.cpu() for tensor in tpl)) for tpl in res[key]) + else: + res[key] = tuple(tensor.cpu() for tensor in res[key]) + else: + res[key] = res[key].cpu() + model_outputs[key].append(res[key]) + + generated_texts = [self.tokenizer.decode(seq, skip_special_tokens=True, + clean_up_tokenization_spaces=request.task_kwargs.get("clean_up_tokenization_spaces", False)) + for seq in 
res["sequences"]] + task_outputs["generated_texts"].append(generated_texts) + return PredictionOutput(model_outputs=model_outputs, task_outputs=task_outputs) def _question_answering(self, request: PredictionRequest) -> PredictionOutput: # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 333b150e7..f3f6df4cd 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,3 +1,4 @@ +from collections import Iterable from typing import Dict, Union, Tuple import torch @@ -15,7 +16,10 @@ def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]], retu :param obj: the objects whose tensors will be encoded :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. """ + # Encode numpy array either as lists or base64 string def encode(arr): + if isinstance(arr, torch.Tensor): + arr = arr.numpy() if return_plaintext: return arr.tolist() else: @@ -30,13 +34,17 @@ def encode(arr): # arr_binary_b64 = arr_string_b64.encode() # arr_binary = base64.decodebytes(arr_binary_b64) # arr = np.load(BytesIO(arr_binary)) + + # Recursively go through a value and encode leafs (=tensors) it or iterate over values and encode them + def enc_or_iterate(val): + if isinstance(val, Iterable) and not isinstance(val, torch.Tensor) and not isinstance(val, np.ndarray): + return [enc_or_iterate(v) for v in val] + else: + return encode(val) + for k, v in obj.items(): - if isinstance(v, torch.Tensor): - v = v.numpy() - obj[k] = encode(v) - elif isinstance(v, tuple) and isinstance(v[0], torch.Tensor): - v = [encode(vv.numpy()) for vv in v] - obj[k] = v + v = enc_or_iterate(v) + obj[k] = v return obj diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 8a2f333f8..55a6e05e2 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -23,6 +23,7 @@ from square_model_inference.inference.adaptertransformer import AdapterTransformer from square_model_inference.inference.sentencetransformer import SentenceTransformer + @pytest.fixture(scope="session") def test_app(): app = get_app() @@ -30,12 +31,6 @@ def test_app(): return app -@pytest.fixture() -def test_client(test_app): - with TestClient(test_app) as test_client: - yield test_client - - class TestModel(Model): def __init__(self): self.prediction = PredictionOutput(model_outputs={}, task_outputs={}) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index fe6794e6b..cf12f1e0f 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -16,7 +16,7 @@ #Downloaded models: TRANSFORMER_MODEL = "bert-base-uncased" -SENTENCE_MODEL = "paraphrase-MiniLM-L6-v2 " +SENTENCE_MODEL = "paraphrase-albert-small-v2" if __name__ == "__main__": bert_transformer = Transformer(TRANSFORMER_MODEL, 
"base", 1, True) diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py new file mode 100644 index 000000000..585801453 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py @@ -0,0 +1,10 @@ +import pytest + +from square_model_inference.models.request import PredictionRequest +import numpy as np + + +def test_adapter(): + pytest.fail("Adapter tests waiting for next version of adapter-transformers. " + "Not enough heads available. " + "Waiting for https://github.com/Adapter-Hub/adapter-transformers/commit/94ff58dbf09ac1d14b7107c9ce48770d2d352161") \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py new file mode 100644 index 000000000..a290a78c6 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py @@ -0,0 +1,54 @@ +import pytest + +import numpy as np +from square_model_inference.models.request import PredictionRequest + + +@pytest.mark.usefixtures("test_sentence_transformer") +class TestSentenceTransformerEmbedding: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])]) + async def test_embedding(self, test_sentence_transformer, input): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_sentence_transformer.predict(request) + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + + @pytest.mark.asyncio + async def test_not_embedding(self, test_sentence_transformer): + request = PredictionRequest.parse_obj({ + "input": ["input"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_sentence_transformer.predict(request) + + @pytest.mark.asyncio + async def test_forbid_is_preprocessed(self, test_sentence_transformer): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": True, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_sentence_transformer.predict(request) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index a8631c0ef..9c1a9b00d 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -5,12 +5,11 @@ @pytest.mark.usefixtures("test_transformer_sequence_classification") class TestTransformerSequenceClassification: - @pytest.mark.asyncio - @pytest.mark.parametrize("input,output", [(["this is a test"], [1]), - (["this is a test", "this is a test with a longer sentence"], [1, 1])], + 
@pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification(self, test_transformer_sequence_classification, input, output): + async def test_sequence_classification(self, test_transformer_sequence_classification, input): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, @@ -22,8 +21,9 @@ async def test_sequence_classification(self, test_transformer_sequence_classific }) prediction = await test_transformer_sequence_classification.predict(request) - np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are not converted to softmax") - assert prediction.task_outputs["labels"] == output + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") + assert len(prediction.task_outputs["labels"]) == len(input) + assert all(isinstance(prediction.task_outputs["labels"][i], int) for i in range(len(input))) assert "logits" in prediction.model_outputs assert "id2label" in prediction.task_outputs @@ -43,23 +43,22 @@ async def test_sequence_classification_output_attention(self, test_transformer_s assert "attentions" in prediction.model_outputs @pytest.mark.asyncio - @pytest.mark.parametrize("input,output", [(["this is a test"], [[0.17748619616031647, 0.4802907109260559]]), - (["this is a test", "this is a test with a longer sentence"], - [[0.17748622596263885, 0.48029083013534546], [-0.048022523522377014, 0.5764690637588501]])], + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification_regression(self, test_transformer_sequence_classification, input, output): + async def test_sequence_classification_regression(self, test_transformer_sequence_classification, input): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, "task": "sequence_classification", - "task_kwargs": {"regression": True}, + "task_kwargs": {"is_regression": True}, "adapter_name": "" }) prediction = await test_transformer_sequence_classification.predict(request) - assert prediction.model_outputs["logits"] == output + assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -68,12 +67,11 @@ async def test_sequence_classification_regression(self, test_transformer_sequenc class TestTransformerTokenClassification: @pytest.mark.asyncio - @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[0, 1, 1, 1, 1, 1]], [[None, 0, 1, 2, 3, None]]), + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), (["this is a test", "this is a test with a longer sentence"], - [[0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification(self, test_transformer_token_classification, input, output, word_ids): + async def test_token_classification(self, test_transformer_token_classification, input, word_ids): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, @@ -85,65 
+83,39 @@ async def test_token_classification(self, test_transformer_token_classification, }) prediction = await test_transformer_token_classification.predict(request) - np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(output), err_msg="logits are not converted to softmax") - assert prediction.task_outputs["labels"] == output + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") + assert all(len(prediction.task_outputs["labels"][i]) == len(word_ids[i]) for i in range(len(input))) assert "logits" in prediction.model_outputs assert "id2label" in prediction.task_outputs assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.asyncio - @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[[0.11221601068973541, 0.05969493091106415], - [-0.15642595291137695, -0.07739989459514618], - [-0.2801300287246704, 0.06872765719890594], - [-0.28268659114837646, -0.01801353693008423], - [0.032697953283786774, 0.051608189940452576], - [0.01579795777797699, 0.23798659443855286]]], + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), (["this is a test", "this is a test with a longer sentence"], - [[[0.11221593618392944, 0.059694863855838776], - [-0.15642589330673218, -0.07740016281604767], - [-0.2801305651664734, 0.06872743368148804], - [-0.28268682956695557, -0.018013179302215576], - [0.03269825875759125, 0.051608070731163025], - [0.01579788327217102, 0.2379867136478424], - [0.03173816204071045, -0.15285855531692505], - [0.11003853380680084, -0.12873490154743195], - [0.13318589329719543, -0.10646772384643555], - [0.13220593333244324, -0.12443935126066208]], - [[-0.05789753794670105, 0.04508095979690552], - [-0.27872341871261597, -0.1229611188173294], - [-0.5753417015075684, 0.07893967628479004], - [-0.47106701135635376, -0.04298338294029236], - [-0.5324697494506836, 0.10730768740177155], - [-0.31086403131484985, 0.3265591263771057], - [-0.2457294911146164, 0.24867495894432068], - [-0.2389427125453949, 0.4464331567287445], - [-0.08393016457557678, -0.03680300712585449], - [-0.01722254604101181, 0.41447973251342773]]], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification_regression(self, test_transformer_token_classification, input, output, word_ids): + async def test_token_classification_regression(self, test_transformer_token_classification, input, word_ids): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, "task": "token_classification", - "task_kwargs": {"regression": True}, + "task_kwargs": {"is_regression": True}, "adapter_name": "" }) prediction = await test_transformer_token_classification.predict(request) - assert prediction.model_outputs["logits"] == output + assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.usefixtures("test_transformer_embedding") -class TestTransformerSequenceClassification: - +class TestTransformerEmbedding: @pytest.mark.asyncio @pytest.mark.parametrize("input,mode", [(["this is a test"], "mean"), (["this is a test", "this is a test with a longer 
sentence"], "mean"), @@ -207,9 +179,22 @@ async def test_embedding_unknown_mode(self, test_transformer_embedding): with pytest.raises(ValueError): prediction = await test_transformer_embedding.predict(request) + @pytest.mark.asyncio + async def test_forbid_is_preprocessed(self, test_transformer_embedding): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": True, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_transformer_embedding.predict(request) @pytest.mark.usefixtures("test_transformer_question_answering") -class TestTransformerSequenceClassification: +class TestTransformerQuestionAnswering: @pytest.mark.asyncio @pytest.mark.parametrize("input", [([["What is a test?", "A test is a thing where you test."]]), @@ -256,11 +241,12 @@ async def test_question_answering_topk(self, test_transformer_question_answering @pytest.mark.usefixtures("test_transformer_generation") class TestTransformerGeneration: @pytest.mark.asyncio - async def test_generation(self, test_transformer_generation): + @pytest.mark.parametrize("input", [(["Generate text"]), + (["Generate text", "And more text"])], + ) + async def test_generation(self, test_transformer_generation, input): request = PredictionRequest.parse_obj({ - "input": [ - "this is a test" - ], + "input": input, "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, @@ -270,4 +256,45 @@ async def test_generation(self, test_transformer_generation): }) prediction = await test_transformer_generation.predict(request) - assert prediction \ No newline at end of file + assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) + + @pytest.mark.asyncio + async def test_generation_output_attention_and_scores(self, test_transformer_generation): + request = PredictionRequest.parse_obj({ + "input": ["Generate text"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": { + "output_attentions": True, + "output_scores": True + }, + "task": "generation", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_generation.predict(request) + assert "scores" in prediction.model_outputs + assert "attentions" in prediction.model_outputs + + @pytest.mark.asyncio + async def test_generation_beam_sample_multiple_seqs(self, test_transformer_generation): + request = PredictionRequest.parse_obj({ + "input": ["Generate text"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "generation", + "task_kwargs": { + "num_beams": 2, + "do_sample": True, + "top_k": 10, + "top_p": 0.5, + "no_repeat_ngram_size": 2, + "num_return_sequences": 2 + }, + "adapter_name": "" + }) + + prediction = await test_transformer_generation.predict(request) + assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file From eee585178c60b07e5b09bc921ac451d806590d74 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 17 Jun 2021 17:12:24 +0200 Subject: [PATCH 05/23] Extended documentation --- .../api/routes/prediction.py | 2 + .../square_model_inference/core/config.py | 17 +++++--- .../core/event_handlers.py | 4 +- .../inference/adaptertransformer.py | 19 +++++++-- .../inference/sentencetransformer.py | 17 ++++++-- .../inference/transformer.py | 40 ++++++++++++++---- .../models/prediction.py | 17 ++++++-- .../square_model_inference/models/request.py | 42 
+++++++++++++++---- .../inference_server/tests/conftest.py | 2 +- .../pre_test_setup_for_docker_caching.py | 2 +- 10 files changed, 125 insertions(+), 37 deletions(-) diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index 5c2f6ec25..077a58b99 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -1,5 +1,6 @@ from fastapi import APIRouter from starlette.requests import Request +from loguru import logger from square_model_inference.models.request import PredictionRequest from square_model_inference.models.prediction import PredictionOutput @@ -14,6 +15,7 @@ async def predict( prediction_request: PredictionRequest = None, ) -> PredictionOutput: + logger.info(f"Request: {request.dict()}") model: Model = request.app.state.model prediction: PredictionOutput = await model.predict(prediction_request) diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index a3973bdee..c1cd0c6a3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -8,17 +8,24 @@ # Disable CUDA even if available DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) -# Corresponds to the Huggingface name for Transformers +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers MODEL_NAME: str = config("MODEL_NAME") # Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model MODEL_TYPE: str = config("MODEL_TYPE") # Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") -MAX_BATCH_SIZE: int = config("MAX_BATCH_SIZE", cast=int, default=32) +# Batch size used for many inputs +BATCH_SIZE: int = config("BATCH_SIZE", cast=int, default=32) -# For MODEL_TYPE=transformers: decides the AutoModelFor* class used +# For MODEL_TYPE=transformers: decides the AutoModel* class used +# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class MODEL_CLASS: str = config("MODEL_CLASS", default="base") -# Flag that decides if returned numpy arrays are returned as lists or encoded to base64 (smaller but not easily human readable) -RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) \ No newline at end of file +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). 
+# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index bec998460..5a629fa2e 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -4,7 +4,7 @@ from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, MAX_BATCH_SIZE, TRANSFORMERS_CACHE +from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, TRANSFORMERS_CACHE from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer @@ -18,7 +18,7 @@ "model_name": MODEL_NAME, "model_class": MODEL_CLASS, "disable_gpu": DISABLE_GPU, - "max_batch_size": MAX_BATCH_SIZE, + "batch_size": BATCH_SIZE, "transformers_cache": TRANSFORMERS_CACHE } diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index e76259aa3..6e8ef5aa3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -1,6 +1,5 @@ import json -import torch from loguru import logger from transformers import AutoModelWithHeads @@ -13,10 +12,22 @@ class AdapterTransformer(Transformer): - def __init__(self, model_name, max_batch_size, disable_gpu, transformers_cache, **kwargs): + """ + The class for all adapter-based models using the adapter-transformers package + """ + def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, **kwargs): + """ + Initialize the Adapter with its underlying Transformer and pre-load all available adapters from adapterhub.ml + :param model_name: the Huggingface model name + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param transformers_cache: Should be same as TRANSFORMERS_CACHE env variable. + This folder will be used to store the adapters + :param kwargs: Not used + """ self._load_model(AutoModelWithHeads, model_name, disable_gpu) self._load_adapter(model_name, transformers_cache) - self.max_batch_size = max_batch_size + self.batch_size = batch_size def _load_adapter(self, model_name, transformers_cache): """ @@ -24,6 +35,7 @@ def _load_adapter(self, model_name, transformers_cache): We parse the hub index to extract all names and then load each model. 
""" logger.info("Loading all available adapters from adapterhub.ml") + logger.warning("This will not load adapters published only on https://huggingface.co/models") index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) adapter_index = json.load(open(index_file)) adapters = set() @@ -69,7 +81,6 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") - if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: raise ValueError(f"Unknown or missing adapter {request.adapter_name}. " f"Please provider a fully specified adapter name from adapterhub.ml") diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index 7ed77b813..6244a0222 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -13,11 +13,20 @@ class SentenceTransformer(Model): - SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + """ + The class for all sentence-transformers models + """ - def __init__(self, model_name, max_batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, batch_size, disable_gpu, **kwargs): + """ + Initialize the SentenceTransformer + :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param kwargs: Not used + """ self._load_model(model_name, disable_gpu) - self.max_batch_size = max_batch_size + self.batch_size = batch_size def _load_model(self, model_name, disable_gpu): """ @@ -31,7 +40,7 @@ def _load_model(self, model_name, disable_gpu): self.model = model def _embedding(self, request: PredictionRequest) -> PredictionOutput: - embeddings = self.model.encode(request.input, batch_size=self.max_batch_size, show_progress_bar=False) + embeddings = self.model.encode(request.input, batch_size=self.batch_size, show_progress_bar=False) return PredictionOutput(model_outputs={"embeddings": embeddings}, task_outputs={}) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index a6994ec4d..5e4dc9a15 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -22,13 +22,24 @@ } class Transformer(Model): + """ + The class for all Huggingface transformer-based models + """ SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] - def __init__(self, model_name, model_class, max_batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, model_class, batch_size, disable_gpu, **kwargs): + """ + Initialize the Transformer + :param model_name: the Huggingface model name + :param model_class: the class name (according to CLASS_MAPPING) to use + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to 
GPU even if CUDA is available + :param kwargs: Not used + """ if model_class not in CLASS_MAPPING: raise RuntimeError(f"Unknown MODEL_CLASS. Must be one of {CLASS_MAPPING.keys()}") self._load_model(CLASS_MAPPING[model_class], model_name, disable_gpu) - self.max_batch_size = max_batch_size + self.batch_size = batch_size def _load_model(self, model_cls, model_name, disable_gpu): """ @@ -59,15 +70,22 @@ def _ensure_tensor_on_device(self, **inputs): def _predict(self, request: PredictionRequest, output_features=False) \ -> Union[dict, Tuple[dict, dict]]: + """ + Inference on the input. + :param request: the request with the input and optional kwargs + :param output_features: return the features of the input. + Necessary if, e.g., attention mask is needed for post-processing. + :return: The model outputs and optionally the input features + """ all_predictions = [] request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", True) request.preprocessing_kwargs["truncation"] = request.preprocessing_kwargs.get("truncation", True) features = self.tokenizer(request.input, return_tensors="pt", **request.preprocessing_kwargs) - for start_idx in range(0, len(request.input), self.max_batch_size): + for start_idx in range(0, len(request.input), self.batch_size): with torch.no_grad(): - input_features = {k: features[k][start_idx:start_idx+self.max_batch_size] for k in features.keys()} + input_features = {k: features[k][start_idx:start_idx+self.batch_size] for k in features.keys()} input_features = self._ensure_tensor_on_device(**input_features) predictions = self.model(**input_features, **request.model_kwargs) all_predictions.append(predictions) @@ -123,7 +141,7 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: def _token_classification(self, request: PredictionRequest) -> PredictionOutput: predictions, features = self._predict(request, output_features=True) - # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # If logits dim > 1 or if the 'is_regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax label2id = self.model.config.label2id id2label = {v:k for k,v in label2id.items()} @@ -145,7 +163,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp task_outputs = { "id2label": id2label } - # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # If logits dim > 1 or if the 'is_regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("is_regression", False): probabilities = torch.softmax(predictions["logits"], dim=-1) @@ -159,6 +177,7 @@ def _generation(self, request: PredictionRequest) -> PredictionOutput: request.preprocessing_kwargs["add_special_tokens"] = request.preprocessing_kwargs.get("add_special_tokens", False) task_outputs = {"generated_texts": []} model_outputs = defaultdict(list) + # We cannot batch generate so we have to to it separately for each input prompt. 
for prompt in request.input: features = self.tokenizer(prompt, return_tensors="pt", **request.preprocessing_kwargs) input_ids = features["input_ids"] @@ -185,6 +204,13 @@ def _generation(self, request: PredictionRequest) -> PredictionOutput: return PredictionOutput(model_outputs=model_outputs, task_outputs=task_outputs) def _question_answering(self, request: PredictionRequest) -> PredictionOutput: + """ + Span-based question answering for a given question and context. + + We expect the input to use the (question, context) format for the text pairs. + :param request: + :return: + """ # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, undesired_tokens_: np.ndarray) -> Tuple: """ @@ -254,7 +280,7 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True))) starts, ends, scores = decode( - start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 512), undesired_tokens + start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 128), undesired_tokens ) enc = features[idx] answers = [ diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index f3f6df4cd..98fea524a 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -11,8 +11,8 @@ def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]], return_plaintext: bool=RETURN_PLAINTEXT_ARRAYS) -> Dict[str, Union[list, str]]: """ - Encodes the Torch Tensors first to Numpy arrays and then makes either plain lists or base64-encode them depending - on the flag RETURN_PLAINTEXT_ARRAYS + Encodes the Torch Tensors first to Numpy arrays and then encodes them either as plain lists or base64 string + depending on the flag RETURN_PLAINTEXT_ARRAYS :param obj: the objects whose tensors will be encoded :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. """ @@ -30,6 +30,7 @@ def encode(arr): arr_binary_b64 = base64.b64encode(arr_binary) arr_string_b64 = arr_binary_b64.decode("latin1") return arr_string_b64 + # DECODE THE VALUE WITH # arr_binary_b64 = arr_string_b64.encode() # arr_binary = base64.decodebytes(arr_binary_b64) @@ -53,11 +54,19 @@ class PredictionOutput(BaseModel): The results of the prediction of the model on the given input for the requested task. """ model_outputs: dict = Field( - description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array." + description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array.

" + "Decode the base64 string 'arr_string_b64' back to an array in Python like this:
" + "arr_binary_b64 = arr_string_b64.encode()
" + "arr_binary = base64.decodebytes(arr_binary_b64)
" + "arr = np.load(BytesIO(arr_binary))

" + "Task 'generation' does not concatenate the tensors of the inputs together but instead creates a list" + "with each entry corresponding to the respective input." + ) task_outputs: dict = Field( description="Dictionary containing the task outputs. This covers anything from generated text, " - "labels, id2label mappings, token2word mappings, etc." + "labels, id2label mappings, token2word mappings, etc.

" + "SentenceTransformer: This is not used.
" ) def __init__(self, **data): diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index ab3c7e1ea..ad4af8ae6 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -25,31 +25,55 @@ class PredictionRequest(BaseModel): ..., description="Input for the model. Supports Huggingface Transformer inputs (i.e., list of sentences, or " "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " - "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True." + "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True. " + "

" + "Transformer/ Adapter:
" + "Task 'question_answering' expects the input to be in the (question, context) format." ) is_preprocessed: bool = Field( default=False, description="Flag indicating that the input contains already pre-processed numpy arrays " - "as list and that it needs no further pre-processing." + "as list and that it needs no further pre-processing.

" + "Transformer/ Adapter/ SentenceTransformer: 'is_preprocessed' is not supported." ) preprocessing_kwargs: dict = Field( default={}, - description="Dictionary containing additional parameters for the pre-processing step." + description="Optional dictionary containing additional parameters for the pre-processing step.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the Huggingface tokenizer for possible parameters." ) model_kwargs: dict = Field( default={}, - description="Dictionary containing parameters that are passed to the model for the forward pass. " - "Set ‘output_attentions=True’ to receive the attention weights for Huggingface Transformers in the " - "output." + description="Optional dictionary containing parameters that are passed to the model for the forward pass " + "to control what additional tensors are returned.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" + "For example, set ‘output_attentions=True’ to receive the attention results in the output." ) task: Task = Field(...) task_kwargs: dict = Field( default={}, - description="Dictionary containing additional parameters for handling of the task and " - "task-related post-processing." + description="Optional dictionary containing additional parameters for handling of the task and " + "task-related post-processing.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter:
" + "'sentence_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'token_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'embedding':
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "'question_answering':
" + "- 'topk': Return the top-k most likely spans. Default 1.
" + "- 'max_answer_len': Maximal token length of answers. Default 128.
" + "'generation':
" + "- 'clean_up_tokenization_spaces': See parameter in Huggingface tokenizer.decode(). Default False
" + "- See Huggingface model.generate() for all possible parameters that can be used. " + "Note, 'model_kwargs' and 'task_kwargs' are merged for generation." + ) adapter_name: Optional[str] = Field( default="", - description="Only necessary for adapter-based models. " + description="Only necessary for Adapter. " "The fully specified name of the to-be-used adapter from adapterhub.ml" ) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 55a6e05e2..df5906e2f 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -12,7 +12,7 @@ # environ["MODEL_NAME"] = "test" # environ["MODEL_TYPE"] = "test" # environ["DISABLE_GPU"] = "True" -# environ["MAX_BATCH_SIZE"] = "1" +# environ["BATCH_SIZE"] = "1" # environ["RETURN_PLAINTEXT_ARRAYS"] = "False" from main import get_app diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index cf12f1e0f..0b8a3fd41 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -7,7 +7,7 @@ environ["MODEL_NAME"] = "test" environ["MODEL_TYPE"] = "test" environ["DISABLE_GPU"] = "True" -environ["MAX_BATCH_SIZE"] = "1" +environ["BATCH_SIZE"] = "1" environ["RETURN_PLAINTEXT_ARRAYS"] = "True" from square_model_inference.inference.transformer import Transformer From 0029da1eb5370af5506197bdb68f91f7638c1817 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 24 Jun 2021 15:18:11 +0200 Subject: [PATCH 06/23] Updated prediction documentation; question-answering: added 'no answer' support --- .../api/routes/prediction.py | 2 +- .../inference/adaptertransformer.py | 1 + .../inference/transformer.py | 11 ++++- .../models/prediction.py | 46 ++++++++++++++++++- 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index 077a58b99..6632c3bf4 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -15,7 +15,7 @@ async def predict( prediction_request: PredictionRequest = None, ) -> PredictionOutput: - logger.info(f"Request: {request.dict()}") + logger.info(f"Request: {prediction_request.dict()}") model: Model = request.app.state.model prediction: PredictionOutput = await model.predict(prediction_request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 6e8ef5aa3..3a0507219 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -36,6 +36,7 @@ def _load_adapter(self, model_name, transformers_cache): """ logger.info("Loading all available adapters from adapterhub.ml") logger.warning("This will not load adapters published only on 
https://huggingface.co/models") + logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) adapter_index = json.load(open(index_file)) adapters = set() diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 5e4dc9a15..46c68c77a 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -267,7 +267,10 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, start = start.numpy() end = end.numpy() # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. - undesired_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1) & features["attention_mask"][idx].numpy() + question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1) + # Unmask CLS token for 'no answer' + question_tokens[0] = 1 + undesired_tokens = question_tokens & features["attention_mask"][idx].numpy() # Generate mask undesired_tokens_mask = undesired_tokens == 0.0 @@ -279,6 +282,10 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True))) end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True))) + # Get score for 'no answer' then mask for decoding step (CLS token + no_answer_score = (start[0] * end[0]).item() + start[0] = end[0] = 0.0 + starts, ends, scores = decode( start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 128), undesired_tokens ) @@ -294,6 +301,8 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]], } for s, e, score in zip(starts, ends, scores)] + answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""}) + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: request.task_kwargs.get("topk", 1)] task_outputs["answers"].append(answers) return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 98fea524a..e1160a1da 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -59,15 +59,59 @@ class PredictionOutput(BaseModel): "arr_binary_b64 = arr_string_b64.encode()
" "arr_binary = base64.decodebytes(arr_binary_b64)
" "arr = np.load(BytesIO(arr_binary))

" + "SentenceTransformer:
" + "'embedding':
" + "- 'embeddings: Embedding tensors.
" + "Transformer/ Adapter:
" + "Optional tensor depend on request's 'model_kwargs' parameters, e.g. 'output_attentions'. " + "See the Huggingface documentation for information like shape etc.
" + + "'sentence_classification':
" + "- 'logits': (Softmax) logits of the classifier.
" + "'token_classification':
" + "- 'logits': (Softmax) logits of the classifier.
" + "'embedding':
" + "- 'embeddings: Embedding tensors.
" + "'question_answering':
" + "- 'start_logits': Logits for the beginning of the span
" + "- 'end_logits': Logits for the end of the span
" + "'generation':
" + "- 'sequences': The generated vocab ids for the sequence
" "Task 'generation' does not concatenate the tensors of the inputs together but instead creates a list" - "with each entry corresponding to the respective input." + "of the tensors for each input." ) task_outputs: dict = Field( description="Dictionary containing the task outputs. This covers anything from generated text, " "labels, id2label mappings, token2word mappings, etc.

" "SentenceTransformer: This is not used.
" + "Transformer/ Adapter:
" + "'sentence_classification':
" + "- 'labels': List of the predicted label ids for the input
" + "- 'id2label': Mapping from label id to the label name
" + "'token_classification':
" + "- 'labels': List of the predicted label ids for the input
" + "- 'id2label': Mapping from label id to the label name
" + "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input.
" + "'None' represents special tokens that have been added by the tokenizer
" + "'embedding':
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')
" + "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input. " + "Only returned if 'embedding_mode=token'
" + "'question_answering':
" + "- 'answers': List of lists of answers. Length of outer list is the number of inputs, " + "length of inner list is parameter 'topk' from the request's 'task_kwargs' (default 1). " + "Each answer is a dictionary with 'score', 'start' (span start index in context), 'end' (span end index in context), " + "and 'answer' (the extracted span). The inner list is sorted by score. If no answer span was extracted, " + "the empty span is returned (start and end both 0).
" + "'generation':
" + "- 'generated_texts': List of list of the generated texts. Length of outer list is the number of inputs, " + "length of inner list is parameter 'num_return_sequences' in request's 'task_kwargs'" ) + model_output_is_encoded: bool = Field( + not RETURN_PLAINTEXT_ARRAYS, + description="Flag indicating that 'model_output' is a base64-encoded numpy array and not a human-readable list." + "See the field description for 'model_output' on information on how to decode the array.") def __init__(self, **data): """ From 32c973a7cfb2557b60d136fa08e6bc71041652f9 Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 29 Jun 2021 09:23:20 +0200 Subject: [PATCH 07/23] Removed project dependencies from pre-test setup so it can run without the project code (Docker caching reasons) --- .../pre_test_setup_for_docker_caching.py | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index 0b8a3fd41..a6b537d5f 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -1,8 +1,15 @@ # This file is used for the Docker image to cache long-running setup for the tests, # i.e., downloading finetuned Transformer models and so on. # This way, adding new tests or even changing the server code does NOT trigger a new download during building +import json + +from sentence_transformers import SentenceTransformer from starlette.config import environ -TRANSFORMERS_TESTING_CACHE = "./model_testing_cache" +from transformers import AutoTokenizer, AutoModelWithHeads +from loguru import logger +from transformers.adapter_utils import download_cached, ADAPTER_HUB_INDEX_FILE + +TRANSFORMERS_TESTING_CACHE = "./.model_testing_cache" environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE environ["MODEL_NAME"] = "test" environ["MODEL_TYPE"] = "test" @@ -10,15 +17,34 @@ environ["BATCH_SIZE"] = "1" environ["RETURN_PLAINTEXT_ARRAYS"] = "True" -from square_model_inference.inference.transformer import Transformer -from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.inference.sentencetransformer import SentenceTransformer - -#Downloaded models: +# Downloaded models: TRANSFORMER_MODEL = "bert-base-uncased" SENTENCE_MODEL = "paraphrase-albert-small-v2" if __name__ == "__main__": - bert_transformer = Transformer(TRANSFORMER_MODEL, "base", 1, True) - adapter_transformer = AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) - sentence_transformer = SentenceTransformer(SENTENCE_MODEL) \ No newline at end of file + # We pre-download all models needed for the tests. + # We have to be careful to NOT import anything from square_model_inference because this script runs in the Dockerfile + # BEFORE any other of our code is copied (so that we do not have to re-download after every code change). 
+ device = "cpu" + + # Pre-download Huggingface model for tests + _ = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL) + model = AutoModelWithHeads.from_pretrained(TRANSFORMER_MODEL).to(device) + + # Pre-download adapters + logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") + index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(TRANSFORMER_MODEL)) + adapter_index = json.load(open(index_file)) + adapters = set() + for task, datasets in adapter_index.items(): + for dataset in datasets.keys(): + for key in datasets[dataset].keys(): + if key != "default": + for org in datasets[dataset][key]["versions"].keys(): + adapters.add(f"{task}/{dataset}@{org}") + for adapter in adapters: + logger.debug(f"Loading adapter {adapter}") + model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) + + # Pre-download sentence-transformer models for tests + _ = SentenceTransformer(model_name_or_path=SENTENCE_MODEL, device=device) From 9c5da6ae4818021d17dc49c4721115e4b5fe317a Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 1 Jul 2021 16:03:35 +0200 Subject: [PATCH 08/23] Made PredictionRequest a fixture and just set relevant fields --- .../inference_server/tests/conftest.py | 13 ++ .../tests/test_inference/test_transformers.py | 211 +++++------------- 2 files changed, 73 insertions(+), 151 deletions(-) diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index df5906e2f..c3a4984ae 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -78,3 +78,16 @@ def test_adapter(): @pytest.fixture(scope="class") def test_sentence_transformer(): return SentenceTransformer(SENTENCE_MODEL, 1, True) + +@pytest.fixture() +def request(): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + return request \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index 9c1a9b00d..bf6c9a3d2 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -9,16 +9,9 @@ class TestTransformerSequenceClassification: @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification(self, test_transformer_sequence_classification, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "sequence_classification", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_sequence_classification(self, request, test_transformer_sequence_classification, input): + request.input = input + request.task = "sequence_classification" prediction = await test_transformer_sequence_classification.predict(request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") @@ -28,16 +21,9 @@ async def test_sequence_classification(self, 
test_transformer_sequence_classific assert "id2label" in prediction.task_outputs @pytest.mark.asyncio - async def test_sequence_classification_output_attention(self, test_transformer_sequence_classification): - request = PredictionRequest.parse_obj({ - "input": ["testing test"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {"output_attentions": True}, - "task": "sequence_classification", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_sequence_classification_output_attention(self, request, test_transformer_sequence_classification): + request.task = "sequence_classification" + request.model_kwargs = {"output_attentions": True} prediction = await test_transformer_sequence_classification.predict(request) assert "attentions" in prediction.model_outputs @@ -46,16 +32,10 @@ async def test_sequence_classification_output_attention(self, test_transformer_s @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification_regression(self, test_transformer_sequence_classification, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "sequence_classification", - "task_kwargs": {"is_regression": True}, - "adapter_name": "" - }) + async def test_sequence_classification_regression(self, request, test_transformer_sequence_classification, input): + request.input = input + request.task = "sequence_classification" + request.task_kwargs = {"is_regression": True} prediction = await test_transformer_sequence_classification.predict(request) assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" @@ -71,16 +51,9 @@ class TestTransformerTokenClassification: (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification(self, test_transformer_token_classification, input, word_ids): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "token_classification", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_token_classification(self, request, test_transformer_token_classification, input, word_ids): + request.input = input + request.task = "token_classification" prediction = await test_transformer_token_classification.predict(request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") @@ -96,16 +69,10 @@ async def test_token_classification(self, test_transformer_token_classification, (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification_regression(self, test_transformer_token_classification, input, word_ids): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "token_classification", - "task_kwargs": {"is_regression": True}, - "adapter_name": "" - }) + async def test_token_classification_regression(self, request, test_transformer_token_classification, input, 
word_ids): + request.input = input + request.task = "token_classification" + request.task_kwargs = {"is_regression": True} prediction = await test_transformer_token_classification.predict(request) assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") @@ -124,16 +91,10 @@ class TestTransformerEmbedding: (["this is a test"], "cls"), (["this is a test", "this is a test with a longer sentence"], "cls")], ) - async def test_embedding(self, test_transformer_embedding, input, mode): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {"embedding_mode": mode}, - "adapter_name": "" - }) + async def test_embedding(self, request, test_transformer_embedding, input, mode): + request.input = input + request.task = "embedding" + request.task_kwargs = {"embedding_mode": mode} prediction = await test_transformer_embedding.predict(request) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 @@ -146,16 +107,10 @@ async def test_embedding(self, test_transformer_embedding, input, mode): (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_embedding_token(self, test_transformer_embedding, input, word_ids): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {"embedding_mode": "token"}, - "adapter_name": "" - }) + async def test_embedding_token(self, request, test_transformer_embedding, input, word_ids): + request.input = input + request.task = "embedding" + request.task_kwargs = {"embedding_mode": "token"} prediction = await test_transformer_embedding.predict(request) assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 @@ -166,30 +121,18 @@ async def test_embedding_token(self, test_transformer_embedding, input, word_ids assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.asyncio - async def test_embedding_unknown_mode(self, test_transformer_embedding): - request = PredictionRequest.parse_obj({ - "input": ["test"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {"embedding_mode": "this mode does not exist"}, - "adapter_name": "" - }) + async def test_embedding_unknown_mode(self, request, test_transformer_embedding): + request.task = "embedding" + request.task_kwargs = {"embedding_mode": "this mode does not exist"} + with pytest.raises(ValueError): prediction = await test_transformer_embedding.predict(request) @pytest.mark.asyncio - async def test_forbid_is_preprocessed(self, test_transformer_embedding): - request = PredictionRequest.parse_obj({ - "input": ["test"], - "is_preprocessed": True, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_forbid_is_preprocessed(self, request, test_transformer_embedding): + request.task = "embedding" + request.is_preprocessed = True + with pytest.raises(ValueError): prediction = await test_transformer_embedding.predict(request) @@ -201,16 +144,10 @@ class TestTransformerQuestionAnswering: ([["What is a test?", "A test is a thing where you test."], ["What is a longer test?", "A test is a thing 
where you test. If it is longer you call it longer"]])], ) - async def test_question_answering(self, test_transformer_question_answering, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "question_answering", - "task_kwargs": {"topk": 1}, - "adapter_name": "" - }) + async def test_question_answering(self, request, test_transformer_question_answering, input): + request.input = input + request.task = "question_answering" + request.task_kwargs = {"topk": 1} prediction = await test_transformer_question_answering.predict(request) answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] @@ -219,17 +156,10 @@ async def test_question_answering(self, test_transformer_question_answering, inp assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) @pytest.mark.asyncio - async def test_question_answering_topk(self, test_transformer_question_answering): - input = [["What is a test?", "A test is a thing where you test."]] - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "question_answering", - "task_kwargs": {"topk": 2}, - "adapter_name": "" - }) + async def test_question_answering_topk(self, request, test_transformer_question_answering): + request.input = [["What is a test?", "A test is a thing where you test."]] + request.task = "question_answering" + request.task_kwargs = {"topk": 2} prediction = await test_transformer_question_answering.predict(request) answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] @@ -244,57 +174,36 @@ class TestTransformerGeneration: @pytest.mark.parametrize("input", [(["Generate text"]), (["Generate text", "And more text"])], ) - async def test_generation(self, test_transformer_generation, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "generation", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_generation(self, request, test_transformer_generation, input): + request.input = input + request.task = "generation" prediction = await test_transformer_generation.predict(request) assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) @pytest.mark.asyncio - async def test_generation_output_attention_and_scores(self, test_transformer_generation): - request = PredictionRequest.parse_obj({ - "input": ["Generate text"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": { - "output_attentions": True, - "output_scores": True - }, - "task": "generation", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_generation_output_attention_and_scores(self, request, test_transformer_generation): + request.task = "generation" + request.model_kwargs = { + "output_attentions": True, + "output_scores": True + } prediction = await test_transformer_generation.predict(request) assert "scores" in prediction.model_outputs assert "attentions" in prediction.model_outputs @pytest.mark.asyncio - async def test_generation_beam_sample_multiple_seqs(self, test_transformer_generation): - request = PredictionRequest.parse_obj({ - "input": ["Generate text"], - 
"is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "generation", - "task_kwargs": { - "num_beams": 2, - "do_sample": True, - "top_k": 10, - "top_p": 0.5, - "no_repeat_ngram_size": 2, - "num_return_sequences": 2 - }, - "adapter_name": "" - }) + async def test_generation_beam_sample_multiple_seqs(self, request, test_transformer_generation): + request.task = "generation" + request.task_kwargs = { + "num_beams": 2, + "do_sample": True, + "top_k": 10, + "top_p": 0.5, + "no_repeat_ngram_size": 2, + "num_return_sequences": 2 + } prediction = await test_transformer_generation.predict(request) assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file From fd7fae3c0652cc6a32a9496f3d2b24129d2fbef6 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 1 Jul 2021 16:20:22 +0200 Subject: [PATCH 09/23] Fixed disallowed fixture name (request -> prediction_request) --- .../inference_server/tests/conftest.py | 2 +- .../test_sentence_transformer.py | 48 ++----- .../tests/test_inference/test_transformers.py | 126 +++++++++--------- 3 files changed, 77 insertions(+), 99 deletions(-) diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index c3a4984ae..880250d56 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -80,7 +80,7 @@ def test_sentence_transformer(): return SentenceTransformer(SENTENCE_MODEL, 1, True) @pytest.fixture() -def request(): +def prediction_request(): request = PredictionRequest.parse_obj({ "input": ["test"], "is_preprocessed": False, diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py index a290a78c6..92d70faf6 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py @@ -1,7 +1,6 @@ import pytest import numpy as np -from square_model_inference.models.request import PredictionRequest @pytest.mark.usefixtures("test_sentence_transformer") @@ -10,45 +9,24 @@ class TestSentenceTransformerEmbedding: @pytest.mark.asyncio @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])]) - async def test_embedding(self, test_sentence_transformer, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {}, - "adapter_name": "" - }) - - prediction = await test_sentence_transformer.predict(request) + async def test_embedding(self, prediction_request, test_sentence_transformer, input): + prediction_request.task = "embedding" + prediction_request.input = input + + prediction = await test_sentence_transformer.predict(prediction_request) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) @pytest.mark.asyncio - async def test_not_embedding(self, test_sentence_transformer): - request = PredictionRequest.parse_obj({ - "input": ["input"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "sequence_classification", - "task_kwargs": {}, - "adapter_name": 
"" - }) + async def test_not_embedding(self, prediction_request, test_sentence_transformer): + prediction_request.task = "sequence_classification" with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(request) + prediction = await test_sentence_transformer.predict(prediction_request) @pytest.mark.asyncio - async def test_forbid_is_preprocessed(self, test_sentence_transformer): - request = PredictionRequest.parse_obj({ - "input": ["test"], - "is_preprocessed": True, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_forbid_is_preprocessed(self, prediction_request, test_sentence_transformer): + prediction_request.task = "embedding" + prediction_request.is_preprocessed = True + with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(request) \ No newline at end of file + prediction = await test_sentence_transformer.predict(prediction_request) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index bf6c9a3d2..5f99ed649 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -1,19 +1,19 @@ import pytest -from square_model_inference.models.request import PredictionRequest import numpy as np + @pytest.mark.usefixtures("test_transformer_sequence_classification") class TestTransformerSequenceClassification: @pytest.mark.asyncio @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification(self, request, test_transformer_sequence_classification, input): - request.input = input - request.task = "sequence_classification" + async def test_sequence_classification(self, prediction_request, test_transformer_sequence_classification, input): + prediction_request.input = input + prediction_request.task = "sequence_classification" - prediction = await test_transformer_sequence_classification.predict(request) + prediction = await test_transformer_sequence_classification.predict(prediction_request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") assert len(prediction.task_outputs["labels"]) == len(input) assert all(isinstance(prediction.task_outputs["labels"][i], int) for i in range(len(input))) @@ -21,23 +21,23 @@ async def test_sequence_classification(self, request, test_transformer_sequence_ assert "id2label" in prediction.task_outputs @pytest.mark.asyncio - async def test_sequence_classification_output_attention(self, request, test_transformer_sequence_classification): - request.task = "sequence_classification" - request.model_kwargs = {"output_attentions": True} + async def test_sequence_classification_output_attention(self, prediction_request, test_transformer_sequence_classification): + prediction_request.task = "sequence_classification" + prediction_request.model_kwargs = {"output_attentions": True} - prediction = await test_transformer_sequence_classification.predict(request) + prediction = await test_transformer_sequence_classification.predict(prediction_request) assert "attentions" in prediction.model_outputs @pytest.mark.asyncio 
@pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification_regression(self, request, test_transformer_sequence_classification, input): - request.input = input - request.task = "sequence_classification" - request.task_kwargs = {"is_regression": True} + async def test_sequence_classification_regression(self, prediction_request, test_transformer_sequence_classification, input): + prediction_request.input = input + prediction_request.task = "sequence_classification" + prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_sequence_classification.predict(request) + prediction = await test_transformer_sequence_classification.predict(prediction_request) assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -51,11 +51,11 @@ class TestTransformerTokenClassification: (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification(self, request, test_transformer_token_classification, input, word_ids): - request.input = input - request.task = "token_classification" + async def test_token_classification(self, prediction_request, test_transformer_token_classification, input, word_ids): + prediction_request.input = input + prediction_request.task = "token_classification" - prediction = await test_transformer_token_classification.predict(request) + prediction = await test_transformer_token_classification.predict(prediction_request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") assert all(len(prediction.task_outputs["labels"][i]) == len(word_ids[i]) for i in range(len(input))) assert "logits" in prediction.model_outputs @@ -69,12 +69,12 @@ async def test_token_classification(self, request, test_transformer_token_classi (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification_regression(self, request, test_transformer_token_classification, input, word_ids): - request.input = input - request.task = "token_classification" - request.task_kwargs = {"is_regression": True} + async def test_token_classification_regression(self, prediction_request, test_transformer_token_classification, input, word_ids): + prediction_request.input = input + prediction_request.task = "token_classification" + prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_token_classification.predict(request) + prediction = await test_transformer_token_classification.predict(prediction_request) assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -91,12 +91,12 @@ class TestTransformerEmbedding: (["this is a test"], "cls"), (["this is a test", "this is a test with a longer sentence"], "cls")], ) - async def test_embedding(self, request, test_transformer_embedding, input, mode): - request.input 
= input - request.task = "embedding" - request.task_kwargs = {"embedding_mode": mode} + async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): + prediction_request.input = input + prediction_request.task = "embedding" + prediction_request.task_kwargs = {"embedding_mode": mode} - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) assert "hidden_states" not in prediction.model_outputs @@ -107,12 +107,12 @@ async def test_embedding(self, request, test_transformer_embedding, input, mode) (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_embedding_token(self, request, test_transformer_embedding, input, word_ids): - request.input = input - request.task = "embedding" - request.task_kwargs = {"embedding_mode": "token"} + async def test_embedding_token(self, prediction_request, test_transformer_embedding, input, word_ids): + prediction_request.input = input + prediction_request.task = "embedding" + prediction_request.task_kwargs = {"embedding_mode": "token"} - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) @@ -121,20 +121,20 @@ async def test_embedding_token(self, request, test_transformer_embedding, input, assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.asyncio - async def test_embedding_unknown_mode(self, request, test_transformer_embedding): - request.task = "embedding" - request.task_kwargs = {"embedding_mode": "this mode does not exist"} + async def test_embedding_unknown_mode(self, prediction_request, test_transformer_embedding): + prediction_request.task = "embedding" + prediction_request.task_kwargs = {"embedding_mode": "this mode does not exist"} with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) @pytest.mark.asyncio - async def test_forbid_is_preprocessed(self, request, test_transformer_embedding): - request.task = "embedding" - request.is_preprocessed = True + async def test_forbid_is_preprocessed(self, prediction_request, test_transformer_embedding): + prediction_request.task = "embedding" + prediction_request.is_preprocessed = True with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) @pytest.mark.usefixtures("test_transformer_question_answering") class TestTransformerQuestionAnswering: @@ -144,24 +144,24 @@ class TestTransformerQuestionAnswering: ([["What is a test?", "A test is a thing where you test."], ["What is a longer test?", "A test is a thing where you test. 
If it is longer you call it longer"]])], ) - async def test_question_answering(self, request, test_transformer_question_answering, input): - request.input = input - request.task = "question_answering" - request.task_kwargs = {"topk": 1} + async def test_question_answering(self, prediction_request, test_transformer_question_answering, input): + prediction_request.input = input + prediction_request.task = "question_answering" + prediction_request.task_kwargs = {"topk": 1} - prediction = await test_transformer_question_answering.predict(request) + prediction = await test_transformer_question_answering.predict(prediction_request) answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs assert len(prediction.task_outputs["answers"]) == len(input) assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) @pytest.mark.asyncio - async def test_question_answering_topk(self, request, test_transformer_question_answering): - request.input = [["What is a test?", "A test is a thing where you test."]] - request.task = "question_answering" - request.task_kwargs = {"topk": 2} + async def test_question_answering_topk(self, prediction_request, test_transformer_question_answering): + prediction_request.input = [["What is a test?", "A test is a thing where you test."]] + prediction_request.task = "question_answering" + prediction_request.task_kwargs = {"topk": 2} - prediction = await test_transformer_question_answering.predict(request) + prediction = await test_transformer_question_answering.predict(prediction_request) answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs assert len(prediction.task_outputs["answers"]) == len(input) @@ -174,29 +174,29 @@ class TestTransformerGeneration: @pytest.mark.parametrize("input", [(["Generate text"]), (["Generate text", "And more text"])], ) - async def test_generation(self, request, test_transformer_generation, input): - request.input = input - request.task = "generation" + async def test_generation(self, prediction_request, test_transformer_generation, input): + prediction_request.input = input + prediction_request.task = "generation" - prediction = await test_transformer_generation.predict(request) + prediction = await test_transformer_generation.predict(prediction_request) assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) @pytest.mark.asyncio - async def test_generation_output_attention_and_scores(self, request, test_transformer_generation): - request.task = "generation" - request.model_kwargs = { + async def test_generation_output_attention_and_scores(self, prediction_request, test_transformer_generation): + prediction_request.task = "generation" + prediction_request.model_kwargs = { "output_attentions": True, "output_scores": True } - prediction = await test_transformer_generation.predict(request) + prediction = await test_transformer_generation.predict(prediction_request) assert "scores" in prediction.model_outputs assert "attentions" in prediction.model_outputs @pytest.mark.asyncio - async def test_generation_beam_sample_multiple_seqs(self, request, test_transformer_generation): - request.task = 
"generation" - request.task_kwargs = { + async def test_generation_beam_sample_multiple_seqs(self, prediction_request, test_transformer_generation): + prediction_request.task = "generation" + prediction_request.task_kwargs = { "num_beams": 2, "do_sample": True, "top_k": 10, @@ -205,5 +205,5 @@ async def test_generation_beam_sample_multiple_seqs(self, request, test_transfor "num_return_sequences": 2 } - prediction = await test_transformer_generation.predict(request) + prediction = await test_transformer_generation.predict(prediction_request) assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file From 9f03fff0118d742b1af7eeab0fa79963ee9fa43a Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 9 Jul 2021 16:30:54 +0200 Subject: [PATCH 10/23] Updated requirements and installation process; started splitting PredictionOutput in multiple classes for better readable documentation --- square-model-inference-api/Makefile | 5 +- square-model-inference-api/README.md | 18 ++-- .../inference_server/Dockerfile | 26 +++--- .../inference_server/requirements.txt | 8 -- .../inference_server/requirements1.txt | 8 ++ .../inference_server/requirements2.txt | 1 + .../api/routes/prediction.py | 69 +++++++++++--- .../square_model_inference/core/config.py | 12 ++- .../core/event_handlers.py | 6 +- .../inference/adaptertransformer.py | 55 +++++------ .../square_model_inference/inference/model.py | 5 +- .../inference/sentencetransformer.py | 15 +-- .../inference/transformer.py | 31 ++++--- .../models/prediction.py | 85 +++++++++++------ .../square_model_inference/models/request.py | 2 +- .../inference_server/tests/conftest.py | 15 ++- .../pre_test_setup_for_docker_caching.py | 27 +++--- .../tests/test_inference/test_adapter.py | 3 +- .../test_sentence_transformer.py | 18 ++-- .../tests/test_inference/test_transformers.py | 93 +++++++++---------- .../uninstall_requirements.txt | 1 + square-model-inference-api/nginx/nginx.conf | 8 +- 22 files changed, 304 insertions(+), 207 deletions(-) delete mode 100644 square-model-inference-api/inference_server/requirements.txt create mode 100644 square-model-inference-api/inference_server/requirements1.txt create mode 100644 square-model-inference-api/inference_server/requirements2.txt create mode 100644 square-model-inference-api/inference_server/uninstall_requirements.txt diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 7acebffb0..13e50caf8 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -11,7 +11,10 @@ test: pytest --cov install: - pip install --upgrade pip && pip install -r inference_server/requirements.txt + pip install --upgrade pip + pip install -r inference_server/requirements1.txt + pip uninstall -y -r inference_server/uninstall_requirements.txt + pip install -r inference_server/requirements2.txt run: PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 5b93ceaad..324b89cdc 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -27,15 +27,21 @@ Python 3.7+, Docker (optional), Make (optional) ## Installation Install the required packages in your local environment (ideally virtualenv, conda, etc.). 
- - +```bash +pip install -r inference_server/requirements1.txt +pip uninstall -y -r inference_server/uninstall_requirements.txt +pip install -r inference_server/requirements2.txt +``` +or ```sh -python -m venv venv -source venv/bin/activate make install ``` +**Why two requirement.txt and why the uninstall?** +`sentence-transformers` depends on `transformers` and it will be installed along with it. +However, we use `adapter-transformers` (a fork of `transformers`) in this project. +Both `transformers` and `adapter-transformers` use the same namespace so they conflict. +Thus, we first install `sentence-transformers` along with `transformers`, +uninstall `transformers`, and finally install `adapter-transformers`. #### Running Localhost diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index fe2b5344c..fefc84ae8 100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -1,25 +1,29 @@ -FROM python:3.8.2 as build +FROM python:3.7.6-slim-buster as base ENV PYTHONUNBUFFERED 1 EXPOSE 8000 WORKDIR /app -COPY requirements.txt ./ -RUN pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip install --upgrade pip +COPY requirements1.txt requirements2.txt uninstall_requirements.txt ./ +RUN pip install -r requirements1.txt +RUN pip uninstall -y -r uninstall_requirements.txt +RUN pip install -r requirements2.txt -COPY ./inference_server/tests/pre_test_setup_for_docker_caching.py ./inference_server/tests/pre_test_setup_for_docker_caching.py -RUN PYTHONPATH=./inference_server; python ./inference_server/tests/pre_test_setup_for_docker_caching.py +# Testing stage. We first pre-download any models separately for caching (pre_test_setup_for_docker_caching.py) and then +# run the tests +FROM base as test +COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_docker_caching.py +RUN python ./tests/pre_test_setup_for_docker_caching.py COPY . ./ - -FROM base as test -WORKDIR /app RUN pip install pytest pytest-cov pytest-asyncio -COPY ../tests tests -RUN PYTHONPATH=./inference_server; pytest --cov +RUN PYTHONPATH=./ pytest --cov +# Deployment stage FROM base as build +COPY . 
./ + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/requirements.txt b/square-model-inference-api/inference_server/requirements.txt deleted file mode 100644 index 0b2dd68dc..000000000 --- a/square-model-inference-api/inference_server/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -uvicorn -fastapi -pydantic -loguru -python-dotenv -adapter-transformers -sentencepiece -torch \ No newline at end of file diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt new file mode 100644 index 000000000..d8a296d3e --- /dev/null +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -0,0 +1,8 @@ +uvicorn==0.13.4 +fastapi==0.65.1 +pydantic==1.8.2 +loguru==0.5.3 +python-dotenv==0.17.1 +sentencepiece==0.1.95 +torch==1.8.1 +sentence-transformers==1.2.0 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/requirements2.txt b/square-model-inference-api/inference_server/requirements2.txt new file mode 100644 index 000000000..1503d3a94 --- /dev/null +++ b/square-model-inference-api/inference_server/requirements2.txt @@ -0,0 +1 @@ +adapter-transformers==2.0.1 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index 6632c3bf4..c06c0633f 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -2,21 +2,68 @@ from starlette.requests import Request from loguru import logger -from square_model_inference.models.request import PredictionRequest -from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.inference.model import Model +from square_model_inference.models.request import PredictionRequest, Task +from square_model_inference.models.prediction import PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ + PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding router = APIRouter() +@router.post("/sequence-classification", response_model=PredictionOutputForSequenceClassification, name="sequence classification") +async def sequence_classification( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForSequenceClassification: -@router.post("/predict", response_model=PredictionOutput, name="predict") -async def predict( - request: Request, - prediction_request: PredictionRequest = None, -) -> PredictionOutput: + logger.info(f"Sequence Classification Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.sequence_classification) - logger.info(f"Request: {prediction_request.dict()}") - model: Model = request.app.state.model - prediction: PredictionOutput = await model.predict(prediction_request) + return prediction + +@router.post("/token-classification", response_model=PredictionOutputForTokenClassification, name="token classification") +async def token_classification( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForTokenClassification: + + logger.info(f"Token Classification Request: 
{prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.token_classification) + + return prediction + +@router.post("/embedding", response_model=PredictionOutputForEmbedding, name="embedding") +async def embedding( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForEmbedding: + + logger.info(f"Embedding Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.embedding) return prediction + +@router.post("/question-answering", response_model=PredictionOutputForQuestionAnswering, name="question answering") +async def question_answering( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForQuestionAnswering: + + logger.info(f"Question Answering Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.question_answering) + + return prediction + +@router.post("/generation", response_model=PredictionOutputForGeneration, name="generation") +async def generation( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForGeneration: + + logger.info(f"Generation Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.generation) + + return prediction \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index c1cd0c6a3..224417b3d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -6,19 +6,21 @@ config = Config(".env") -# Disable CUDA even if available -DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) # Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers MODEL_NAME: str = config("MODEL_NAME") # Type of the model, e.g. Transformers, Adapter, ... 
# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model MODEL_TYPE: str = config("MODEL_TYPE") -# Cache directory where model weights are stored -# This is the name for the env variable used by transformers and sentence-transformers package -TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") +# Disable CUDA even if available +DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) # Batch size used for many inputs BATCH_SIZE: int = config("BATCH_SIZE", cast=int, default=32) +MAX_INPUT_SIZE: int = config("MAX_INPUT_SIZE", cast=int, default=1024) + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") # For MODEL_TYPE=transformers: decides the AutoModel* class used # See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 5a629fa2e..9498a0b38 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -4,7 +4,8 @@ from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, TRANSFORMERS_CACHE +from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, \ + TRANSFORMERS_CACHE, MAX_INPUT_SIZE from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer @@ -19,7 +20,8 @@ "model_class": MODEL_CLASS, "disable_gpu": DISABLE_GPU, "batch_size": BATCH_SIZE, - "transformers_cache": TRANSFORMERS_CACHE + "transformers_cache": TRANSFORMERS_CACHE, + "max_input_size": MAX_INPUT_SIZE } def _startup_model(app: FastAPI) -> None: diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 3a0507219..e9c052c4c 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -2,8 +2,7 @@ from loguru import logger -from transformers import AutoModelWithHeads -from transformers.adapters.utils import download_cached, ADAPTER_HUB_INDEX_FILE +from transformers import AutoModelWithHeads, list_adapters from square_model_inference.inference.transformer import Transformer from square_model_inference.models.request import PredictionRequest, Task @@ -15,7 +14,7 @@ class AdapterTransformer(Transformer): """ The class for all adapter-based models using the adapter-transformers package """ - def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, **kwargs): + def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, max_input_size, **kwargs): """ Initialize the Adapter with its underlying Transformer and pre-load all available adapters from adapterhub.ml :param model_name: the Huggingface 
model name @@ -23,34 +22,36 @@ def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, **kw :param disable_gpu: do not move model to GPU even if CUDA is available :param transformers_cache: Should be same as TRANSFORMERS_CACHE env variable. This folder will be used to store the adapters + :param max_input_size: requests with a larger input are rejected :param kwargs: Not used """ self._load_model(AutoModelWithHeads, model_name, disable_gpu) self._load_adapter(model_name, transformers_cache) self.batch_size = batch_size + self.max_input_size = max_input_size def _load_adapter(self, model_name, transformers_cache): """ Pre-load all available adapters for MODEL_NAME from adapterhub.ml. We parse the hub index to extract all names and then load each model. """ - logger.info("Loading all available adapters from adapterhub.ml") - logger.warning("This will not load adapters published only on https://huggingface.co/models") - logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") - index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) - adapter_index = json.load(open(index_file)) - adapters = set() - for task, datasets in adapter_index.items(): - for dataset in datasets.keys(): - for key in datasets[dataset].keys(): - if key != "default": - for org in datasets[dataset][key]["versions"].keys(): - adapters.add(f"{task}/{dataset}@{org}") + logger.info("Loading all available adapters") + adapter_infos = [info for info in list_adapters(source="ah") if info.model_name==model_name] + adapters = set(f"{adapter_info.task}/{adapter_info.subtask}@{adapter_info.username}" for adapter_info in adapter_infos) for adapter in adapters: logger.debug(f"Loading adapter {adapter}") - self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) - - # def _load_single_adapter(self, adapter_name: str): + try: + self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) + except KeyError as e: + logger.debug(f"Could not load {adapter} due to incomplete config: Missing key {e.args[0]}") + except RuntimeError as e: + if "Error(s) in loading state_dict" in e.args[0]: + logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") + else: + raise(e) + + +# def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: # logger.info(f"Loading new adapter {adapter_name}") # self.model.load_adapter(adapter_name, with_head=True, load_as=adapter_name) @@ -64,7 +65,7 @@ def _token_classification(self, request: PredictionRequest) -> PredictionOutput: label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] id2label = {v:k for k,v in label2id.items()} - prediction.task_outputs["id2label"] = id2label + prediction.id2label = id2label return prediction @@ -75,26 +76,28 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] id2label = {v:k for k,v in label2id.items()} - prediction.task_outputs["id2label"] = id2label + prediction.id2label = id2label return prediction - async def predict(self, request: PredictionRequest) -> PredictionOutput: + async def predict(self, request: PredictionRequest, task: Task) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") + if len(request.input) > self.max_input_size: + raise ValueError(f"Input is too large. Max input size is {self.max_input_size}") if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: raise ValueError(f"Unknown or missing adapter {request.adapter_name}. " f"Please provider a fully specified adapter name from adapterhub.ml") self.model.set_active_adapters(request.adapter_name) - if request.task == Task.sequence_classification: + if task == Task.sequence_classification: return self._sequence_classification(request) - elif request.task == Task.token_classification: + elif task == Task.token_classification: return self._token_classification(request) - elif request.task == Task.question_answering: + elif task == Task.question_answering: return self._question_answering(request) - elif request.task == Task.embedding: + elif task == Task.embedding: return self._embedding(request) - elif request.task == Task.generation: + elif task == Task.generation: return self._generation(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/model.py b/square-model-inference-api/inference_server/square_model_inference/inference/model.py index 4aa92d8d2..f8fbed524 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/model.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/model.py @@ -1,4 +1,4 @@ -from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutput @@ -8,11 +8,12 @@ class Model: __init__ is supposed to load all weights and other necessary files (e.g. 
tokenizer) so that the model can directly perform inference on request """ - async def predict(self, payload: PredictionRequest) -> PredictionOutput: + async def predict(self, payload: PredictionRequest, task: Task) -> PredictionOutput: """ Take an input, pre-process it accordingly, perform inference according to the task, post-process the result and return it :param payload: the prediction request containing the input and any other parameters required + :param task: The task that the model should perform with the payload :return: the result of the prediction """ raise NotImplementedError diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index 6244a0222..7523bab29 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -9,7 +9,7 @@ from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task -from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForEmbedding class SentenceTransformer(Model): @@ -17,16 +17,18 @@ class SentenceTransformer(Model): The class for all sentence-transformers models """ - def __init__(self, model_name, batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, batch_size, disable_gpu, max_input_size, **kwargs): """ Initialize the SentenceTransformer :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) :param batch_size: batch size used for inference :param disable_gpu: do not move model to GPU even if CUDA is available + :param max_input_size: requests with a larger input are rejected :param kwargs: Not used """ self._load_model(model_name, disable_gpu) self.batch_size = batch_size + self.max_input_size = max_input_size def _load_model(self, model_name, disable_gpu): """ @@ -41,14 +43,15 @@ def _load_model(self, model_name, disable_gpu): def _embedding(self, request: PredictionRequest) -> PredictionOutput: embeddings = self.model.encode(request.input, batch_size=self.batch_size, show_progress_bar=False) - return PredictionOutput(model_outputs={"embeddings": embeddings}, task_outputs={}) + return PredictionOutputForEmbedding(model_outputs={"embeddings": embeddings}) - async def predict(self, request: PredictionRequest) -> PredictionOutput: + async def predict(self, request: PredictionRequest, task: Task) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") - - if request.task != Task.embedding: + if len(request.input) > self.max_input_size: + raise ValueError(f"Input is too large. 
Max input size is {self.max_input_size}") + if task != Task.embedding: raise ValueError("Only embedding task supported by this model") return self._embedding(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 46c68c77a..c7657de47 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -11,7 +11,8 @@ from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task -from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ + PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding CLASS_MAPPING = { "base": AutoModel, @@ -27,19 +28,21 @@ class Transformer(Model): """ SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] - def __init__(self, model_name, model_class, batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, model_class, batch_size, disable_gpu, max_input_size, **kwargs): """ Initialize the Transformer :param model_name: the Huggingface model name :param model_class: the class name (according to CLASS_MAPPING) to use :param batch_size: batch size used for inference :param disable_gpu: do not move model to GPU even if CUDA is available + :param max_input_size: requests with a larger input are rejected :param kwargs: Not used """ if model_class not in CLASS_MAPPING: raise RuntimeError(f"Unknown MODEL_CLASS. 
Must be one of {CLASS_MAPPING.keys()}") self._load_model(CLASS_MAPPING[model_class], model_name, disable_gpu) self.batch_size = batch_size + self.max_input_size = max_input_size def _load_model(self, model_cls, model_name, disable_gpu): """ @@ -137,7 +140,7 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: task_outputs["word_ids"] = [features.word_ids(i) for i in range(len(request.input))] predictions["embeddings"] = emb - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForEmbedding(model_outputs=predictions, **task_outputs) def _token_classification(self, request: PredictionRequest) -> PredictionOutput: predictions, features = self._predict(request, output_features=True) @@ -154,7 +157,7 @@ def _token_classification(self, request: PredictionRequest) -> PredictionOutput: predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForTokenClassification(model_outputs=predictions, **task_outputs) def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: predictions = self._predict(request) @@ -170,7 +173,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForSequenceClassification(model_outputs=predictions, **task_outputs) def _generation(self, request: PredictionRequest) -> PredictionOutput: request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", False) @@ -201,7 +204,7 @@ def _generation(self, request: PredictionRequest) -> PredictionOutput: clean_up_tokenization_spaces=request.task_kwargs.get("clean_up_tokenization_spaces", False)) for seq in res["sequences"]] task_outputs["generated_texts"].append(generated_texts) - return PredictionOutput(model_outputs=model_outputs, task_outputs=task_outputs) + return PredictionOutputForGeneration(model_outputs=model_outputs, **task_outputs) def _question_answering(self, request: PredictionRequest) -> PredictionOutput: """ @@ -304,20 +307,22 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""}) answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: request.task_kwargs.get("topk", 1)] task_outputs["answers"].append(answers) - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForQuestionAnswering(model_outputs=predictions, **task_outputs) - async def predict(self, request: PredictionRequest) -> PredictionOutput: + async def predict(self, request: PredictionRequest, task: Task) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + if len(request.input) > self.max_input_size: + raise ValueError(f"Input is too large. 
Max input size is {self.max_input_size}") - if request.task == Task.sequence_classification: + if task == Task.sequence_classification: return self._sequence_classification(request) - elif request.task == Task.token_classification: + elif task == Task.token_classification: return self._token_classification(request) - elif request.task == Task.embedding: + elif task == Task.embedding: return self._embedding(request) - elif request.task == Task.question_answering: + elif task == Task.question_answering: return self._question_answering(request) - elif request.task == Task.generation: + elif task == Task.generation: return self._generation(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index e1160a1da..67be53aba 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,5 +1,5 @@ from collections import Iterable -from typing import Dict, Union, Tuple +from typing import Dict, Union, Tuple, List, Optional import torch from io import BytesIO @@ -53,7 +53,7 @@ class PredictionOutput(BaseModel): """ The results of the prediction of the model on the given input for the requested task. """ - model_outputs: dict = Field( + model_outputs: Dict = Field( description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array.

" "Decode the base64 string 'arr_string_b64' back to an array in Python like this:
" "arr_binary_b64 = arr_string_b64.encode()
" @@ -79,34 +79,6 @@ class PredictionOutput(BaseModel): "- 'sequences': The generated vocab ids for the sequence
" "Task 'generation' does not concatenate the tensors of the inputs together but instead creates a list" "of the tensors for each input." - - ) - task_outputs: dict = Field( - description="Dictionary containing the task outputs. This covers anything from generated text, " - "labels, id2label mappings, token2word mappings, etc.

" - "SentenceTransformer: This is not used.
" - "Transformer/ Adapter:
" - "'sentence_classification':
" - "- 'labels': List of the predicted label ids for the input
" - "- 'id2label': Mapping from label id to the label name
" - "'token_classification':
" - "- 'labels': List of the predicted label ids for the input
" - "- 'id2label': Mapping from label id to the label name
" - "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input.
" - "'None' represents special tokens that have been added by the tokenizer
" - "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')
" - "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input. " - "Only returned if 'embedding_mode=token'
" - "'question_answering':
" - "- 'answers': List of lists of answers. Length of outer list is the number of inputs, " - "length of inner list is parameter 'topk' from the request's 'task_kwargs' (default 1). " - "Each answer is a dictionary with 'score', 'start' (span start index in context), 'end' (span end index in context), " - "and 'answer' (the extracted span). The inner list is sorted by score. If no answer span was extracted, " - "the empty span is returned (start and end both 0).
" - "'generation':
" - "- 'generated_texts': List of list of the generated texts. Length of outer list is the number of inputs, " - "length of inner list is parameter 'num_return_sequences' in request's 'task_kwargs'" ) model_output_is_encoded: bool = Field( not RETURN_PLAINTEXT_ARRAYS, @@ -125,3 +97,56 @@ def __init__(self, **data): super().__init__(**data) self.model_outputs = _encode_numpy(self.model_outputs) + +class PredictionOutputForSequenceClassification(PredictionOutput): + labels: List[int] = Field([], description="List of the predicted label ids for the input. Not set for regression.") + id2label: Dict[int, str] = Field({}, description="Mapping from label id to the label name. Not set for regression.") + + def __init__(self, **data): + super().__init__(**data) + + +class PredictionOutputForTokenClassification(PredictionOutput): + labels: List[List[int]] = Field([], description="List of the predicted label ids for the input. Not set for regression.") + id2label: Dict[int, str] = Field({}, description="Mapping from label id to the label name. Not set for regression.") + word_ids: List[List[Optional[int]]] = Field(..., description="Mapping from each token to the corresponding word in the input. " + "'None' represents special tokens added by tokenizer") + def __init__(self, **data): + super().__init__(**data) + + +class PredictionOutputForEmbedding(PredictionOutput): + embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.
One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')") + word_ids: List[List[Optional[int]]] = Field([], description="Only used by Transformers/ Adapters.
" + "Only set with embedding_mode='token'." + " Mapping from each token to the corresponding word in the input. " + "'None' represents special tokens added by tokenizer") + def __init__(self, **data): + super().__init__(**data) + + +class PredictionOutputForGeneration(PredictionOutput): + generated_texts: List[List[str]] = Field(..., description="List of list of the generated texts. Length of outer list is the number of inputs, " + "length of inner list is parameter 'num_return_sequences' in request's 'task_kwargs'") + def __init__(self, **data): + super().__init__(**data) + + +class QAAnswer(BaseModel): + """ + A span answer for question_answering with a score, the start and end character index and the extracted span answer. + """ + score: float + start: int + end: int + answer: str + + +class PredictionOutputForQuestionAnswering(PredictionOutput): + answers: List[List[QAAnswer]] = Field(..., description="List of lists of answers. Length of outer list is the number of inputs, " + "length of inner list is parameter 'topk' from the request's 'task_kwargs' (default 1). " + "Each answer is a dictionary with 'score', 'start' (span start index in context), 'end' (span end index in context), " + "and 'answer' (the extracted span). The inner list is sorted by score. If no answer span was extracted, " + "the empty span is returned (start and end both 0)") + def __init__(self, **data): + super().__init__(**data) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index ad4af8ae6..547de9191 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -50,7 +50,7 @@ class PredictionRequest(BaseModel): "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" "For example, set ‘output_attentions=True’ to receive the attention results in the output." ) - task: Task = Field(...) + #task: Task = Field(...) 
task_kwargs: dict = Field( default={}, description="Optional dictionary containing additional parameters for handling of the task and " diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 880250d56..268f69990 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -43,41 +43,41 @@ async def predict(self, payload: PredictionRequest) -> PredictionOutput: @pytest.fixture(scope="class") def test_transformer_sequence_classification(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "sequence_classification", 1, True) + return Transformer(TRANSFORMER_MODEL, "sequence_classification", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_embedding(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "base", 1, True) + return Transformer(TRANSFORMER_MODEL, "base", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_token_classification(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "token_classification", 1, True) + return Transformer(TRANSFORMER_MODEL, "token_classification", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_question_answering(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "question_answering", 1, True) + return Transformer(TRANSFORMER_MODEL, "question_answering", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_generation(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "generation", 1, True) + return Transformer(TRANSFORMER_MODEL, "generation", 1, True, 50) @pytest.fixture(scope="class") def test_adapter(): - return AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) + return AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE, 50) @pytest.fixture(scope="class") def test_sentence_transformer(): - return SentenceTransformer(SENTENCE_MODEL, 1, True) + return SentenceTransformer(SENTENCE_MODEL, 1, True, 50) @pytest.fixture() def prediction_request(): @@ -86,7 +86,6 @@ def prediction_request(): "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, - "task": "sequence_classification", "task_kwargs": {}, "adapter_name": "" }) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index a6b537d5f..bb41c27ba 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -5,9 +5,8 @@ from sentence_transformers import SentenceTransformer from starlette.config import environ -from transformers import AutoTokenizer, AutoModelWithHeads +from transformers import AutoTokenizer, AutoModelWithHeads, list_adapters from loguru import logger -from transformers.adapter_utils import download_cached, ADAPTER_HUB_INDEX_FILE TRANSFORMERS_TESTING_CACHE = "./.model_testing_cache" environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE @@ -16,6 +15,7 @@ environ["DISABLE_GPU"] = "True" environ["BATCH_SIZE"] = "1" environ["RETURN_PLAINTEXT_ARRAYS"] = "True" +environ["MAX_INPUT_SIZE"] = "100" # Downloaded models: TRANSFORMER_MODEL = "bert-base-uncased" @@ -32,19 +32,20 @@ model = 
AutoModelWithHeads.from_pretrained(TRANSFORMER_MODEL).to(device) # Pre-download adapters - logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") - index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(TRANSFORMER_MODEL)) - adapter_index = json.load(open(index_file)) - adapters = set() - for task, datasets in adapter_index.items(): - for dataset in datasets.keys(): - for key in datasets[dataset].keys(): - if key != "default": - for org in datasets[dataset][key]["versions"].keys(): - adapters.add(f"{task}/{dataset}@{org}") + logger.info("Loading all available adapters") + adapter_infos = [info for info in list_adapters(source="ah") if info.model_name==TRANSFORMER_MODEL] + adapters = set(f"{adapter_info.task}/{adapter_info.subtask}@{adapter_info.username}" for adapter_info in adapter_infos) for adapter in adapters: logger.debug(f"Loading adapter {adapter}") - model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) + try: + model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) + except KeyError as e: + logger.debug(f"Could not load {adapter} due to incomplete config:\n{e.args[0]}") + except RuntimeError as e: + if "Error(s) in loading state_dict" in e.args[0]: + logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") + else: + raise(e) # Pre-download sentence-transformer models for tests _ = SentenceTransformer(model_name_or_path=SENTENCE_MODEL, device=device) diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py index 585801453..aa02d1e35 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py @@ -3,7 +3,8 @@ from square_model_inference.models.request import PredictionRequest import numpy as np - +# todo +@pytest.mark.skip(reason="Waiting for next update of adapter-transformers") def test_adapter(): pytest.fail("Adapter tests waiting for next version of adapter-transformers. " "Not enough heads available. 
" diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py index 92d70faf6..1265aafb2 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py @@ -2,6 +2,8 @@ import numpy as np +from square_model_inference.models.request import Task + @pytest.mark.usefixtures("test_sentence_transformer") class TestSentenceTransformerEmbedding: @@ -10,23 +12,27 @@ class TestSentenceTransformerEmbedding: @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])]) async def test_embedding(self, prediction_request, test_sentence_transformer, input): - prediction_request.task = "embedding" prediction_request.input = input - prediction = await test_sentence_transformer.predict(prediction_request) + prediction = await test_sentence_transformer.predict(prediction_request, Task.embedding) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) @pytest.mark.asyncio async def test_not_embedding(self, prediction_request, test_sentence_transformer): - prediction_request.task = "sequence_classification" with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(prediction_request) + prediction = await test_sentence_transformer.predict(prediction_request, Task.sequence_classification) @pytest.mark.asyncio async def test_forbid_is_preprocessed(self, prediction_request, test_sentence_transformer): - prediction_request.task = "embedding" prediction_request.is_preprocessed = True with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(prediction_request) \ No newline at end of file + prediction = await test_sentence_transformer.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_input_too_big(self, prediction_request, test_sentence_transformer): + prediction_request.input = ["test"]*1000 + + with pytest.raises(ValueError): + prediction = await test_sentence_transformer.predict(prediction_request, Task.embedding) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index 5f99ed649..f389e9d00 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -2,6 +2,8 @@ import numpy as np +from square_model_inference.models.request import Task + @pytest.mark.usefixtures("test_transformer_sequence_classification") class TestTransformerSequenceClassification: @@ -11,21 +13,18 @@ class TestTransformerSequenceClassification: ids=["single", "batch"]) async def test_sequence_classification(self, prediction_request, test_transformer_sequence_classification, input): prediction_request.input = input - prediction_request.task = "sequence_classification" - - prediction = await test_transformer_sequence_classification.predict(prediction_request) + + prediction = await test_transformer_sequence_classification.predict(prediction_request, Task.sequence_classification) 
np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") - assert len(prediction.task_outputs["labels"]) == len(input) - assert all(isinstance(prediction.task_outputs["labels"][i], int) for i in range(len(input))) + assert len(prediction.labels) == len(input) + assert all(isinstance(prediction.labels[i], int) for i in range(len(input))) assert "logits" in prediction.model_outputs - assert "id2label" in prediction.task_outputs @pytest.mark.asyncio async def test_sequence_classification_output_attention(self, prediction_request, test_transformer_sequence_classification): - prediction_request.task = "sequence_classification" prediction_request.model_kwargs = {"output_attentions": True} - prediction = await test_transformer_sequence_classification.predict(prediction_request) + prediction = await test_transformer_sequence_classification.predict(prediction_request, Task.sequence_classification) assert "attentions" in prediction.model_outputs @pytest.mark.asyncio @@ -34,12 +33,10 @@ async def test_sequence_classification_output_attention(self, prediction_request ids=["single", "batch"]) async def test_sequence_classification_regression(self, prediction_request, test_transformer_sequence_classification, input): prediction_request.input = input - prediction_request.task = "sequence_classification" prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_sequence_classification.predict(prediction_request) + prediction = await test_transformer_sequence_classification.predict(prediction_request, Task.sequence_classification) assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" - assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -53,14 +50,12 @@ class TestTransformerTokenClassification: ids=["single", "batch"]) async def test_token_classification(self, prediction_request, test_transformer_token_classification, input, word_ids): prediction_request.input = input - prediction_request.task = "token_classification" - prediction = await test_transformer_token_classification.predict(prediction_request) + prediction = await test_transformer_token_classification.predict(prediction_request, Task.token_classification) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") - assert all(len(prediction.task_outputs["labels"][i]) == len(word_ids[i]) for i in range(len(input))) + assert all(len(prediction.labels[i]) == len(word_ids[i]) for i in range(len(input))) assert "logits" in prediction.model_outputs - assert "id2label" in prediction.task_outputs - assert prediction.task_outputs["word_ids"] == word_ids + assert prediction.word_ids == word_ids @pytest.mark.asyncio @@ -71,14 +66,12 @@ async def test_token_classification(self, prediction_request, test_transformer_t ids=["single", "batch"]) async def test_token_classification_regression(self, prediction_request, test_transformer_token_classification, input, word_ids): prediction_request.input = input - prediction_request.task = "token_classification" prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_token_classification.predict(prediction_request) + prediction = await test_transformer_token_classification.predict(prediction_request, Task.token_classification) assert not 
np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") - assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs - assert prediction.task_outputs["word_ids"] == word_ids + assert prediction.word_ids == word_ids @pytest.mark.usefixtures("test_transformer_embedding") @@ -93,14 +86,13 @@ class TestTransformerEmbedding: ) async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): prediction_request.input = input - prediction_request.task = "embedding" prediction_request.task_kwargs = {"embedding_mode": mode} - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) assert "hidden_states" not in prediction.model_outputs - assert prediction.task_outputs["embedding_mode"] == mode + assert prediction.embedding_mode == mode @pytest.mark.asyncio @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), @@ -109,32 +101,36 @@ async def test_embedding(self, prediction_request, test_transformer_embedding, i ids=["single", "batch"]) async def test_embedding_token(self, prediction_request, test_transformer_embedding, input, word_ids): prediction_request.input = input - prediction_request.task = "embedding" prediction_request.task_kwargs = {"embedding_mode": "token"} - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) assert "hidden_states" not in prediction.model_outputs - assert prediction.task_outputs["embedding_mode"] == "token" - assert prediction.task_outputs["word_ids"] == word_ids + assert prediction.embedding_mode == "token" + assert prediction.word_ids == word_ids @pytest.mark.asyncio async def test_embedding_unknown_mode(self, prediction_request, test_transformer_embedding): - prediction_request.task = "embedding" prediction_request.task_kwargs = {"embedding_mode": "this mode does not exist"} with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) @pytest.mark.asyncio async def test_forbid_is_preprocessed(self, prediction_request, test_transformer_embedding): - prediction_request.task = "embedding" prediction_request.is_preprocessed = True with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_input_too_big(self, prediction_request, test_transformer_embedding): + prediction_request.input = ["test"]*1000 + + with pytest.raises(ValueError): + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) @pytest.mark.usefixtures("test_transformer_question_answering") class TestTransformerQuestionAnswering: @@ -146,26 +142,26 @@ class TestTransformerQuestionAnswering: ) async def 
test_question_answering(self, prediction_request, test_transformer_question_answering, input): prediction_request.input = input - prediction_request.task = "question_answering" prediction_request.task_kwargs = {"topk": 1} - prediction = await test_transformer_question_answering.predict(prediction_request) - answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] + prediction = await test_transformer_question_answering.predict(prediction_request, Task.question_answering) + answers = [input[i][1][prediction.answers[i][0].start:prediction.answers[i][0].end] for i in range(len(input))] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs - assert len(prediction.task_outputs["answers"]) == len(input) - assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) + assert len(prediction.answers) == len(input) + assert all(prediction.answers[i][0].answer == answers[i] for i in range(len(input))) @pytest.mark.asyncio async def test_question_answering_topk(self, prediction_request, test_transformer_question_answering): - prediction_request.input = [["What is a test?", "A test is a thing where you test."]] - prediction_request.task = "question_answering" + input = [["What is a test?", "A test is a thing where you test."]] + prediction_request.input = input prediction_request.task_kwargs = {"topk": 2} - prediction = await test_transformer_question_answering.predict(prediction_request) - answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] + prediction = await test_transformer_question_answering.predict(prediction_request, Task.question_answering) + answers = [input[0][1][prediction.answers[0][i].start:prediction.answers[0][i].end] for i in range(2)] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs - assert len(prediction.task_outputs["answers"]) == len(input) - assert all(prediction.task_outputs["answers"][0][i]["answer"] == answers[i] for i in range(2)) + assert len(prediction.answers) == len(input) + assert prediction.answers[0][0].score >= prediction.answers[0][1].score + assert all(prediction.answers[0][i].answer == answers[i] for i in range(2)) @pytest.mark.usefixtures("test_transformer_generation") @@ -176,26 +172,23 @@ class TestTransformerGeneration: ) async def test_generation(self, prediction_request, test_transformer_generation, input): prediction_request.input = input - prediction_request.task = "generation" - prediction = await test_transformer_generation.predict(prediction_request) - assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) + prediction = await test_transformer_generation.predict(prediction_request, Task.generation) + assert all(isinstance(prediction.generated_texts[i][0], str) for i in range(len(input))) @pytest.mark.asyncio async def test_generation_output_attention_and_scores(self, prediction_request, test_transformer_generation): - prediction_request.task = "generation" prediction_request.model_kwargs = { "output_attentions": True, "output_scores": True } - prediction = await test_transformer_generation.predict(prediction_request) + prediction = await test_transformer_generation.predict(prediction_request, Task.generation) assert "scores" in prediction.model_outputs assert "attentions" in prediction.model_outputs 
@pytest.mark.asyncio async def test_generation_beam_sample_multiple_seqs(self, prediction_request, test_transformer_generation): - prediction_request.task = "generation" prediction_request.task_kwargs = { "num_beams": 2, "do_sample": True, @@ -205,5 +198,5 @@ async def test_generation_beam_sample_multiple_seqs(self, prediction_request, te "num_return_sequences": 2 } - prediction = await test_transformer_generation.predict(prediction_request) - assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file + prediction = await test_transformer_generation.predict(prediction_request, Task.generation) + assert len(prediction.generated_texts[0]) == 2 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/uninstall_requirements.txt b/square-model-inference-api/inference_server/uninstall_requirements.txt new file mode 100644 index 000000000..747b7aa97 --- /dev/null +++ b/square-model-inference-api/inference_server/uninstall_requirements.txt @@ -0,0 +1 @@ +transformers \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 20c5c84d4..c6051be8f 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -11,7 +11,7 @@ http { } location /api/bert-base-uncased { - #auth_request /auth; + auth_request /auth; proxy_pass http://square_model_inference_bert:8000/api; } @@ -22,13 +22,7 @@ http { location /auth { internal; - set $query ''; - if ($request_uri ~* "[^\?]+\?(.*)$") { - set $query $1; - } proxy_pass http://auth:8081/auth; - proxy_pass_request_body off; - proxy_set_header Content-Length ""; } } } \ No newline at end of file From 5b6d3fef3acd936b08b8f6ef5df78dd294b8c1bb Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 13 Jul 2021 13:48:55 +0200 Subject: [PATCH 11/23] Updated adapter-transformers and added tests for adapter model --- .../inference_server/requirements2.txt | 2 +- .../inference/adaptertransformer.py | 5 +- .../models/prediction.py | 1 + .../inference_server/tests/conftest.py | 21 +- .../pre_test_setup_for_docker_caching.py | 2 - .../tests/test_api/test_prediction.py | 85 ++++++- .../tests/test_inference/test_adapter.py | 208 +++++++++++++++++- 7 files changed, 294 insertions(+), 30 deletions(-) diff --git a/square-model-inference-api/inference_server/requirements2.txt b/square-model-inference-api/inference_server/requirements2.txt index 1503d3a94..05ea5692f 100644 --- a/square-model-inference-api/inference_server/requirements2.txt +++ b/square-model-inference-api/inference_server/requirements2.txt @@ -1 +1 @@ -adapter-transformers==2.0.1 \ No newline at end of file +adapter-transformers==2.1.1 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index e9c052c4c..06995863c 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -42,14 +42,11 @@ def _load_adapter(self, model_name, transformers_cache): logger.debug(f"Loading adapter {adapter}") try: self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) - except KeyError as e: - logger.debug(f"Could not load {adapter} due to incomplete config: Missing key 
{e.args[0]}") except RuntimeError as e: if "Error(s) in loading state_dict" in e.args[0]: logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") else: - raise(e) - + raise e # def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 67be53aba..18a4f2ed3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -54,6 +54,7 @@ class PredictionOutput(BaseModel): The results of the prediction of the model on the given input for the requested task. """ model_outputs: Dict = Field( + {}, description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array.

" "Decode the base64 string 'arr_string_b64' back to an array in Python like this:
" "arr_binary_b64 = arr_string_b64.encode()
" diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 268f69990..fc8984e78 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -17,8 +17,9 @@ from main import get_app from square_model_inference.inference.model import Model -from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForGeneration, \ + PredictionOutputForEmbedding, PredictionOutputForTokenClassification, PredictionOutputForSequenceClassification, PredictionOutputForQuestionAnswering +from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.inference.transformer import Transformer from square_model_inference.inference.adaptertransformer import AdapterTransformer from square_model_inference.inference.sentencetransformer import SentenceTransformer @@ -32,11 +33,17 @@ def test_app(): class TestModel(Model): - def __init__(self): - self.prediction = PredictionOutput(model_outputs={}, task_outputs={}) - - async def predict(self, payload: PredictionRequest) -> PredictionOutput: - return self.prediction + async def predict(self, payload, task) -> PredictionOutput: + if task == Task.generation: + return PredictionOutputForGeneration(generated_texts=[[""]]) + elif task == Task.question_answering: + return PredictionOutputForQuestionAnswering(answers=[[{"score": 0, "start": 0, "end": 0, "answer": ""}]]) + elif task == Task.embedding: + return PredictionOutputForEmbedding(word_ids=[[0]]) + elif task == Task.token_classification: + return PredictionOutputForTokenClassification(word_ids=[[0]]) + elif task == Task.sequence_classification: + return PredictionOutputForSequenceClassification() # We only load bert-base-uncased, so we fix the random seed to always get the same randomly generated heads on top diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index bb41c27ba..12c6de162 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -39,8 +39,6 @@ logger.debug(f"Loading adapter {adapter}") try: model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) - except KeyError as e: - logger.debug(f"Could not load {adapter} due to incomplete config:\n{e.args[0]}") except RuntimeError as e: if "Error(s) in loading state_dict" in e.args[0]: logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") diff --git a/square-model-inference-api/inference_server/tests/test_api/test_prediction.py b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py index 9829ff680..77767924a 100644 --- a/square-model-inference-api/inference_server/tests/test_api/test_prediction.py +++ b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py @@ -1,10 +1,10 @@ from starlette.testclient import TestClient -def test_prediction(test_app) -> None: +def test_api_sequence_classification(test_app) -> None: test_client = TestClient(test_app) response = test_client.post( - 
"/api/predict", + "/api/sequence-classification", json={ "input": [ "this is a test" @@ -12,7 +12,6 @@ def test_prediction(test_app) -> None: "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, - "task": "embedding", "task_kwargs": {}, "adapter_name": "" } @@ -20,20 +19,88 @@ def test_prediction(test_app) -> None: assert response.status_code == 200 -def test_prediction_wrong_request(test_app) -> None: +def test_api_sequence_classification_malformed_input(test_app) -> None: test_client = TestClient(test_app, raise_server_exceptions=False) response = test_client.post( - "/api/predict", json={ + "/api/sequence-classification", json={ + # "input": [ + # "this is a test" + # ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 422 + + +def test_api_token_classification(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/token-classification", + json={ "input": [ "this is a test" ], "is_preprocessed": False, "preprocessing_kwargs": {}, - # required - #"model_kwargs": {}, - #"task": "embedding", + "model_kwargs": {}, "task_kwargs": {}, "adapter_name": "" } ) - assert response.status_code == 422 + assert response.status_code == 200 + +def test_api_embedding(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/embedding", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 + + +def test_api_question_answering(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/question-answering", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 + + +def test_api_generation(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/generation", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py index aa02d1e35..b379c3e2a 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py @@ -1,11 +1,205 @@ import pytest -from square_model_inference.models.request import PredictionRequest import numpy as np -# todo -@pytest.mark.skip(reason="Waiting for next update of adapter-transformers") -def test_adapter(): - pytest.fail("Adapter tests waiting for next version of adapter-transformers. " - "Not enough heads available. 
" - "Waiting for https://github.com/Adapter-Hub/adapter-transformers/commit/94ff58dbf09ac1d14b7107c9ce48770d2d352161") \ No newline at end of file +from square_model_inference.models.request import Task + + +@pytest.mark.usefixtures("test_adapter") +class TestTransformerAdapter: + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], + ids=["single", "batch"]) + async def test_sequence_classification(self, prediction_request, test_adapter, input): + prediction_request.input = input + prediction_request.adapter_name = "nli/rte@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.sequence_classification) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") + assert len(prediction.labels) == len(input) + assert all(isinstance(prediction.labels[i], int) for i in range(len(input))) + assert "logits" in prediction.model_outputs + + @pytest.mark.asyncio + async def test_sequence_classification_output_attention(self, prediction_request, test_adapter): + prediction_request.model_kwargs = {"output_attentions": True} + prediction_request.adapter_name = "nli/rte@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.sequence_classification) + assert "attentions" in prediction.model_outputs + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], + ids=["single", "batch"]) + async def test_sequence_classification_regression(self, prediction_request, test_adapter, input): + prediction_request.input = input + prediction_request.task_kwargs = {"is_regression": True} + prediction_request.adapter_name = "nli/rte@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.sequence_classification) + assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" + assert "logits" in prediction.model_outputs + + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification(self, prediction_request, test_adapter, input, word_ids): + prediction_request.input = input + prediction_request.adapter_name = "ner/conll2003@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.token_classification) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), atol=1e-6, err_msg="logits should be softmax") + assert all(len(prediction.labels[i]) == len(word_ids[i]) for i in range(len(input))) + assert "logits" in prediction.model_outputs + assert prediction.word_ids == word_ids + + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], + [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification_regression(self, prediction_request, test_adapter, input, word_ids): + prediction_request.input = input + prediction_request.task_kwargs = {"is_regression": True} + 
prediction_request.adapter_name = "ner/conll2003@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.token_classification) + assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") + assert "logits" in prediction.model_outputs + assert prediction.word_ids == word_ids + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,mode", [(["this is a test"], "mean"), + (["this is a test", "this is a test with a longer sentence"], "mean"), + (["this is a test"], "max"), + (["this is a test", "this is a test with a longer sentence"], "max"), + (["this is a test"], "cls"), + (["this is a test", "this is a test with a longer sentence"], "cls")], + ) + async def test_embedding(self, prediction_request, test_adapter, input, mode): + prediction_request.input = input + prediction_request.task_kwargs = {"embedding_mode": mode} + prediction_request.adapter_name = "sts/sts-b@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.embedding) + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.embedding_mode == mode + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_embedding_token(self, prediction_request, test_adapter, input, word_ids): + prediction_request.input = input + prediction_request.task_kwargs = {"embedding_mode": "token"} + prediction_request.adapter_name = "sts/sts-b@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.embedding) + assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.embedding_mode == "token" + assert prediction.word_ids == word_ids + + @pytest.mark.asyncio + async def test_embedding_unknown_mode(self, prediction_request, test_adapter): + prediction_request.task_kwargs = {"embedding_mode": "this mode does not exist"} + prediction_request.adapter_name = "sts/sts-b@ukp" + + with pytest.raises(ValueError): + prediction = await test_adapter.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_forbid_is_preprocessed(self, prediction_request, test_adapter): + prediction_request.is_preprocessed = True + prediction_request.adapter_name = "sts/sts-b@ukp" + + with pytest.raises(ValueError): + prediction = await test_adapter.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_input_too_big(self, prediction_request, test_adapter): + prediction_request.input = ["test"]*1000 + prediction_request.adapter_name = "sts/sts-b@ukp" + + with pytest.raises(ValueError): + prediction = await test_adapter.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [([["What is a test?", "A test is a thing where you test."]]), + ([["What is a test?", "A test is a thing where you test."], + ["What is a longer test?", "A test is a thing where you test. 
If it is longer you call it longer"]])], + ) + async def test_question_answering(self, prediction_request, test_adapter, input): + prediction_request.input = input + prediction_request.task_kwargs = {"topk": 1} + prediction_request.adapter_name = "qa/squad2@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.question_answering) + answers = [input[i][1][prediction.answers[i][0].start:prediction.answers[i][0].end] for i in range(len(input))] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.answers) == len(input) + assert all(prediction.answers[i][0].answer == answers[i] for i in range(len(input))) + + @pytest.mark.asyncio + async def test_question_answering_topk(self, prediction_request, test_adapter): + input = [["What is a test?", "A test is a thing where you test."]] + prediction_request.input = input + prediction_request.task_kwargs = {"topk": 2} + prediction_request.adapter_name = "qa/squad2@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.question_answering) + answers = [input[0][1][prediction.answers[0][i].start:prediction.answers[0][i].end] for i in range(2)] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.answers) == len(input) + assert prediction.answers[0][0].score >= prediction.answers[0][1].score + assert all(prediction.answers[0][i].answer == answers[i] for i in range(2)) + + @pytest.mark.skip("No generation adapter for bert-base-uncased available currently.") + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["Generate text"]), + (["Generate text", "And more text"])], + ) + async def test_generation(self, prediction_request, test_adapter, input): + prediction_request.input = input + + prediction = await test_adapter.predict(prediction_request, Task.generation) + assert all(isinstance(prediction.generated_texts[i][0], str) for i in range(len(input))) + + @pytest.mark.skip("No generation adapter for bert-base-uncased available currently.") + @pytest.mark.asyncio + async def test_generation_output_attention_and_scores(self, prediction_request, test_adapter): + prediction_request.model_kwargs = { + "output_attentions": True, + "output_scores": True + } + + prediction = await test_adapter.predict(prediction_request, Task.generation) + assert "scores" in prediction.model_outputs + assert "attentions" in prediction.model_outputs + + @pytest.mark.skip("No generation adapter for bert-base-uncased available currently.") + @pytest.mark.asyncio + async def test_generation_beam_sample_multiple_seqs(self, prediction_request, test_adapter): + prediction_request.task_kwargs = { + "num_beams": 2, + "do_sample": True, + "top_k": 10, + "top_p": 0.5, + "no_repeat_ngram_size": 2, + "num_return_sequences": 2 + } + + prediction = await test_adapter.predict(prediction_request, Task.generation) + assert len(prediction.generated_texts[0]) == 2 \ No newline at end of file From e368434cf0a3fea6859f4ab3c3acb3e01c5f1017 Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 13 Jul 2021 15:27:16 +0200 Subject: [PATCH 12/23] Updated auth server; fixed output serialization problem discovered by Docker integration test --- .../auth_server/.env.example | 2 +- .../auth_server/Dockerfile | 6 +-- .../auth_server/auth_api/messages.py | 2 +- .../auth_server/auth_api/security.py | 13 +++--- .../auth_server/requirements.txt | 6 +-- square-model-inference-api/docker-compose.yml | 40 ------------------ 
.../example_docker-compose.yml | 42 +++++++++++++++++++ .../models/prediction.py | 12 +++++- 8 files changed, 65 insertions(+), 58 deletions(-) delete mode 100644 square-model-inference-api/docker-compose.yml create mode 100644 square-model-inference-api/example_docker-compose.yml diff --git a/square-model-inference-api/auth_server/.env.example b/square-model-inference-api/auth_server/.env.example index 6125fa20f..95fa8b3b7 100644 --- a/square-model-inference-api/auth_server/.env.example +++ b/square-model-inference-api/auth_server/.env.example @@ -1,2 +1,2 @@ API_KEY=example_key -API_KEY_NAME=api_key \ No newline at end of file +API_KEY_NAME=Authorization \ No newline at end of file diff --git a/square-model-inference-api/auth_server/Dockerfile b/square-model-inference-api/auth_server/Dockerfile index 0ced2f573..aadc437d3 100644 --- a/square-model-inference-api/auth_server/Dockerfile +++ b/square-model-inference-api/auth_server/Dockerfile @@ -1,13 +1,13 @@ -FROM python:3.8.2 +FROM python:3.7.6-slim-buster ENV PYTHONUNBUFFERED 1 EXPOSE 8081 WORKDIR /app +RUN pip install --upgrade pip COPY requirements.txt ./ -RUN pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip install -r requirements.txt COPY . ./ diff --git a/square-model-inference-api/auth_server/auth_api/messages.py b/square-model-inference-api/auth_server/auth_api/messages.py index 85c28dc04..03eb2e101 100644 --- a/square-model-inference-api/auth_server/auth_api/messages.py +++ b/square-model-inference-api/auth_server/auth_api/messages.py @@ -1,4 +1,4 @@ -NO_API_KEY = "No API key provided." +NO_API_KEY = "No API key provided in header {}." AUTH_REQ = "Authentication required." HTTP_500_DETAIL = "Internal server error." diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py index 8f6082a8c..943eee929 100644 --- a/square-model-inference-api/auth_server/auth_api/security.py +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -1,24 +1,21 @@ import secrets from fastapi import HTTPException, Security -from fastapi.security.api_key import APIKeyHeader, APIKeyQuery +from fastapi.security.api_key import APIKeyHeader from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED from .config import API_KEY, API_KEY_NAME from .messages import AUTH_REQ, NO_API_KEY -api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False) api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) -def validate_request(query: str = Security(api_key_query), - header: str = Security(api_key_header),) -> bool: - if query is None and header is None: +def validate_request(header: str = Security(api_key_header),) -> bool: + if header is None: raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY, headers={} + status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_NAME), headers={} ) - elif not secrets.compare_digest(query, str(API_KEY)) \ - or not secrets.compare_digest(header, str(API_KEY)): + elif not secrets.compare_digest(header, str(API_KEY)): raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, detail=AUTH_REQ, headers={} ) diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt index e059f7f5f..c3f33f964 100644 --- a/square-model-inference-api/auth_server/requirements.txt +++ b/square-model-inference-api/auth_server/requirements.txt @@ -1,3 +1,3 @@ -uvicorn -fastapi -pydantic +uvicorn==0.13.4 +fastapi==0.65.1 
+pydantic==1.8.2 diff --git a/square-model-inference-api/docker-compose.yml b/square-model-inference-api/docker-compose.yml deleted file mode 100644 index ada141ee6..000000000 --- a/square-model-inference-api/docker-compose.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: "3" - -services: - inference_bert: - build: ./inference_server/. - image: square_model_inference:latest - ports: - - 8000:8000 - env_file: - - ./inference_server/.env.bert - container_name: square_model_inference_bert - volumes: - - ./.cache/:/etc/huggingface/.cache/ - - ./inference_server/:/app/ #for developement - - #inference_roberta: - # build: ./inference_server/. - # image: square_model_inference:latest - # ports: - # - 8001:8000 - # env_file: - # - ./inference_server/.env.roberta - # container_name: square_model_inference_roberta - # volumes: - # - ./.cache/:/etc/huggingface/.cache/ - # - ./inference_server/:/app/ #for developement - - auth: - build: ./auth_server/. - ports: - - 8081:8081 - env_file: - - ./auth_server/.env.example - - nginx: - image: nginx - ports: - - 8080:8080 - volumes: - - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro \ No newline at end of file diff --git a/square-model-inference-api/example_docker-compose.yml b/square-model-inference-api/example_docker-compose.yml new file mode 100644 index 000000000..d15b00b0a --- /dev/null +++ b/square-model-inference-api/example_docker-compose.yml @@ -0,0 +1,42 @@ +version: "3" + +services: + auth: + build: ./auth_server/. + ports: + - 8081:8081 + env_file: + - ./auth_server/.env.example + + nginx: + image: nginx + ports: + - 8080:8080 + volumes: + - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro + + # Example config for one model server + inference_bert: + build: ./inference_server/. + image: square_model_inference:latest + ports: + - 8000:8000 + env_file: + - ./inference_server/.env.bert + container_name: square_model_inference_bert + volumes: + - ./.cache/:/etc/huggingface/.cache/ + #- ./inference_server/:/app/ #for developement + + # Example config for another model server + inference_roberta: + build: ./inference_server/. 
+ image: square_model_inference:latest + ports: + - 8001:8000 + env_file: + - ./inference_server/.env.roberta + container_name: square_model_inference_roberta + volumes: + - ./.cache/:/etc/huggingface/.cache/ + #- ./inference_server/:/app/ #for developement diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 18a4f2ed3..3f0963f69 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -36,15 +36,23 @@ def encode(arr): # arr_binary = base64.decodebytes(arr_binary_b64) # arr = np.load(BytesIO(arr_binary)) - # Recursively go through a value and encode leafs (=tensors) it or iterate over values and encode them + # Recursively go through a value and encode leaves (=tensors) it or iterate over values and encode them def enc_or_iterate(val): + # Stop attempt to encode an already encoded array + # This can happen because PredictionOutput is initialized twice + # - once by Model and once when request response is serialized by fastAPI + if isinstance(val, int) or isinstance(val, float) or isinstance(val, str): + raise ValueError("Array is already encoded") if isinstance(val, Iterable) and not isinstance(val, torch.Tensor) and not isinstance(val, np.ndarray): return [enc_or_iterate(v) for v in val] else: return encode(val) for k, v in obj.items(): - v = enc_or_iterate(v) + try: + v = enc_or_iterate(v) + except ValueError: + break obj[k] = v return obj From 9575ef8e7a14e27581dedd72683cb21314ceb9f4 Mon Sep 17 00:00:00 2001 From: Geigle Date: Wed, 14 Jul 2021 14:19:15 +0200 Subject: [PATCH 13/23] Extended documentation --- square-model-inference-api/.dockerignore | 1 + square-model-inference-api/.gitignore | 1 + square-model-inference-api/Makefile | 8 +-- square-model-inference-api/README.md | 69 +++++++++---------- .../auth_server/.env.example | 2 +- .../auth_server/auth_api/config.py | 4 +- .../auth_server/auth_api/messages.py | 1 - .../auth_server/auth_api/security.py | 6 +- .../auth_server/main.py | 7 +- .../auth_server/requirements.txt | 6 +- .../inference_server/.env.example | 26 +++++++ .../inference_server/main.py | 2 +- .../inference_server/requirements1.txt | 10 +-- .../api/routes/heartbeat.py | 8 +-- .../api/routes/prediction.py | 7 +- .../square_model_inference/core/config.py | 1 + .../core/event_handlers.py | 4 ++ .../inference/adaptertransformer.py | 2 - .../models/heartbeat.py | 2 +- .../models/prediction.py | 3 +- 20 files changed, 104 insertions(+), 66 deletions(-) create mode 100644 square-model-inference-api/inference_server/.env.example diff --git a/square-model-inference-api/.dockerignore b/square-model-inference-api/.dockerignore index 0af8215d8..cc3361cfa 100644 --- a/square-model-inference-api/.dockerignore +++ b/square-model-inference-api/.dockerignore @@ -60,6 +60,7 @@ nosetests.xml coverage.xml *.cover .hypothesis/ +.model_testing_cache # Translations *.mo diff --git a/square-model-inference-api/.gitignore b/square-model-inference-api/.gitignore index 7389010d8..ed7883485 100644 --- a/square-model-inference-api/.gitignore +++ b/square-model-inference-api/.gitignore @@ -57,6 +57,7 @@ nosetests.xml coverage.xml *.cover .hypothesis/ +.model_testing_cache # Translations *.mo diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 13e50caf8..0e8bc2e54 100644 --- 
a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -8,16 +8,16 @@ all: clean test install run deploy down test: python -m pip install --upgrade pip && pip install pytest pytest-cov pytest-asyncio - pytest --cov + PYTHONPATH=inference_server/ pytest --cov install: pip install --upgrade pip pip install -r inference_server/requirements1.txt - pip uninstall -y -r inference_server/uninstall_requirements.txt - pip install -r inference_server/requirements2.txt + pip uninstall -y -r inference_server/uninstall_requirements.txt + pip install -r inference_server/requirements2.txt run: - PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env + PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8000 --env-file inference_server/.env build: docker-compose build diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 324b89cdc..c51c79ab3 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -4,21 +4,30 @@ Receives input and returns prediction and other artifacts (e.g. attention scores ## Project structure -TODO +The Model API uses 3 components: +1 authorization server, n inference servers (each with their own model), +and a nginx server that serves as API gateway to forward requests to the correct inference server and +to handle authorization of requests with the auth server. ``` -├───tests -│ ├───test_api -│ └───test_service -├───auth_server +├───auth_server # FastAPI Authorization Server +│ ├───main.py # Entry point in server +│ ├───Dockerfile # Dockerfile for server │ └───auth_api -├───nginx -├───inference_server -│ ├───square_model_inference -│ │ ├───api - Model API -│ │ │ ├───routes -│ │ ├───core -│ │ ├───models - Pydantic model definitions for in-/ output -│ │ ├───inference +├───nginx # nginx config for API Gateway & Authorizatio +│ └───nginx.conf +├───inference_server # FastAPI Model API Server +│ ├───tests # Unit Tests +│ │ ├───test_api +│ │ ├───test_inference +│ ├───main.py # Entry point in server +│ ├───Dockerfile # Dockerfile for server +│ └───square_model_inference # Server Logic +│ ├───api # API Routes +│ │ ├───routes +│ ├───core # Server config, Startup logic, etc. +│ ├───models # Input/ output modelling for API +│ └───inference # Deep Model implementation and inference code for NLP tasks +└───example_docker-compose.yml # Example docker-compose setup for the Model API ``` ## Requirements @@ -43,11 +52,16 @@ Both `transformers` and `adapter-transformers` use the same namespace so they co Thus, we first install `sentence-transformers` along with `transformers`, uninstall `transformers`, and finally install `adapter-transformers`. +## Running + #### Running Localhost ```sh make run ``` +This *only* starts one inference server using `inference_server/.env`. No nginx, no auth server. +For debugging, `inference_server/main.py` can also be used as entry. + #### Running Via Docker @@ -62,25 +76,10 @@ make test ``` ## Setup -TODO - -## Run without `make` for development - -1. Start your app with: -```bash -PYTHONPATH=./inference_server uvicorn main:app --reload -``` - -2. 
Go to [http://localhost:8000/docs](http://localhost:8000/docs) or [http://localhost:8000/redoc](http://localhost:8000/redoc) for alternative swagger - - -## Run Tests with using `make` - -Install testing libraries and run `tox`: -```bash -pip install tox pytest flake8 coverage bandit -tox -``` -This runs tests and coverage for Python 3.8 and Flake8, Bandit. - - +1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. +2. For each model server that should run, create a `.env.$model` to configure it. + See [here](inference_server/.env.example) for an example. +3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to + match `container_name` of each server in the `docker-compose.yaml`. +4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the +model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example. \ No newline at end of file diff --git a/square-model-inference-api/auth_server/.env.example b/square-model-inference-api/auth_server/.env.example index 95fa8b3b7..6f37be409 100644 --- a/square-model-inference-api/auth_server/.env.example +++ b/square-model-inference-api/auth_server/.env.example @@ -1,2 +1,2 @@ API_KEY=example_key -API_KEY_NAME=Authorization \ No newline at end of file +API_KEY_HEADER_NAME=Authorization \ No newline at end of file diff --git a/square-model-inference-api/auth_server/auth_api/config.py b/square-model-inference-api/auth_server/auth_api/config.py index ebd10abc4..ec1987091 100644 --- a/square-model-inference-api/auth_server/auth_api/config.py +++ b/square-model-inference-api/auth_server/auth_api/config.py @@ -3,5 +3,7 @@ config = Config(".env") +# The API key required to get authorization API_KEY: Secret = config("API_KEY", cast=Secret) -API_KEY_NAME: str = config("API_KEY_NAME", cast=str, default="api_key") +# Name of the header in the request that contains the API key +API_KEY_HEADER_NAME: str = config("API_KEY_HEADER_NAME", cast=str, default="Authorization") diff --git a/square-model-inference-api/auth_server/auth_api/messages.py b/square-model-inference-api/auth_server/auth_api/messages.py index 03eb2e101..16425b6c8 100644 --- a/square-model-inference-api/auth_server/auth_api/messages.py +++ b/square-model-inference-api/auth_server/auth_api/messages.py @@ -1,4 +1,3 @@ NO_API_KEY = "No API key provided in header {}." AUTH_REQ = "Authentication required." -HTTP_500_DETAIL = "Internal server error." 
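For context on how these auth settings fit together once everything is deployed: below is a minimal client-side sketch of calling a model server through the nginx gateway with the API key from `auth_server/.env`. The host/port, URL path, and payload fields are illustrative assumptions (they depend on the nginx config, the container names, and the model server's `.env`), not part of this patch:

```python
import requests

# Assumptions (hypothetical): nginx gateway on localhost:8080, a model server exposed
# under /api/bert, and API_KEY / API_KEY_HEADER_NAME as configured in auth_server/.env
API_KEY = "example_key"
HEADERS = {"Authorization": API_KEY}  # header name must match API_KEY_HEADER_NAME

payload = {
    "input": ["Hello, world!"],   # list of sentences
    "preprocessing_kwargs": {},
    "model_kwargs": {},
    "task_kwargs": {"embedding_mode": "mean"},
}

response = requests.post(
    "http://localhost:8080/api/bert/embedding",
    json=payload,
    headers=HEADERS,
)
response.raise_for_status()
print(response.json())
```

Requests without the header, or with a wrong key, are rejected by the auth server with 400/401, as implemented by the security checks in the next diff.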
diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py index 943eee929..1a74e31d8 100644 --- a/square-model-inference-api/auth_server/auth_api/security.py +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -4,16 +4,16 @@ from fastapi.security.api_key import APIKeyHeader from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED -from .config import API_KEY, API_KEY_NAME +from .config import API_KEY, API_KEY_HEADER_NAME from .messages import AUTH_REQ, NO_API_KEY -api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) +api_key_header = APIKeyHeader(name=API_KEY_HEADER_NAME, auto_error=False) def validate_request(header: str = Security(api_key_header),) -> bool: if header is None: raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_NAME), headers={} + status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_HEADER_NAME), headers={} ) elif not secrets.compare_digest(header, str(API_KEY)): raise HTTPException( diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index 90332c6d2..93894341f 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -3,6 +3,9 @@ app = FastAPI() + @app.get("/auth") -async def auth(authenticated: bool = Depends(security.validate_request),): - return {"authenticated": True} +async def auth(authenticated: bool = Depends(security.validate_request)): + # authenticated is always True because security.validate_request raises an exception if validation fails + # and it does not return False + return {"authenticated": authenticated} diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt index c3f33f964..1141b0ad8 100644 --- a/square-model-inference-api/auth_server/requirements.txt +++ b/square-model-inference-api/auth_server/requirements.txt @@ -1,3 +1,3 @@ -uvicorn==0.13.4 -fastapi==0.65.1 -pydantic==1.8.2 +uvicorn==0.13.4 # ASGI server +fastapi==0.65.1 # REST API Framework +pydantic==1.8.2 # Input/ output modelling diff --git a/square-model-inference-api/inference_server/.env.example b/square-model-inference-api/inference_server/.env.example new file mode 100644 index 000000000..cff919980 --- /dev/null +++ b/square-model-inference-api/inference_server/.env.example @@ -0,0 +1,26 @@ +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers +MODEL_NAME=bert-base-uncased +# Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model +MODEL_TYPE=adapter + +# Disable CUDA even if available +DISABLE_GPU=True +# Batch size used for many inputs +BATCH_SIZE=32 +# Inputs larger than this size are rejected +MAX_INPUT_SIZE=1024 + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE=../.cache + +# For MODEL_TYPE=transformers: decides the AutoModel* class used +# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class +MODEL_CLASS=base + +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). 
+# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS=False \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index 4d4f0825c..e7004c49e 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -17,4 +17,4 @@ def get_app() -> FastAPI: if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) + uvicorn.run(app, host="0.0.0.0", port=8002) diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt index d8a296d3e..00e9c0a57 100644 --- a/square-model-inference-api/inference_server/requirements1.txt +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -1,8 +1,8 @@ -uvicorn==0.13.4 -fastapi==0.65.1 -pydantic==1.8.2 -loguru==0.5.3 -python-dotenv==0.17.1 +uvicorn==0.13.4 # ASGI server +fastapi==0.65.1 # REST API Framework +pydantic==1.8.2 # Input/ output modelling +loguru==0.5.3 # Easier logging +python-dotenv==0.17.1 # Required for .env configs sentencepiece==0.1.95 torch==1.8.1 sentence-transformers==1.2.0 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py index 702a921bc..67c0570f2 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py @@ -1,11 +1,11 @@ from fastapi import APIRouter -from square_model_inference.models.heartbeat import HearbeatResult +from square_model_inference.models.heartbeat import HeartbeatResult router = APIRouter() -@router.get("/heartbeat", response_model=HearbeatResult, name="heartbeat") -def get_hearbeat() -> HearbeatResult: - heartbeat = HearbeatResult(is_alive=True) +@router.get("/heartbeat", response_model=HeartbeatResult, name="heartbeat") +def get_hearbeat() -> HeartbeatResult: + heartbeat = HeartbeatResult(is_alive=True) return heartbeat diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index c06c0633f..eae7bda22 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -8,6 +8,7 @@ router = APIRouter() + @router.post("/sequence-classification", response_model=PredictionOutputForSequenceClassification, name="sequence classification") async def sequence_classification( request: Request, @@ -15,11 +16,12 @@ async def sequence_classification( ) -> PredictionOutputForSequenceClassification: logger.info(f"Sequence Classification Request: {prediction_request.dict()}") - model = request.app.state.model + model = request.app.state.model # Access model from global app state prediction = await model.predict(prediction_request, Task.sequence_classification) return prediction + @router.post("/token-classification", response_model=PredictionOutputForTokenClassification, name="token classification") async def token_classification( request: Request, @@ -32,6 
+34,7 @@ async def token_classification( return prediction + @router.post("/embedding", response_model=PredictionOutputForEmbedding, name="embedding") async def embedding( request: Request, @@ -44,6 +47,7 @@ async def embedding( return prediction + @router.post("/question-answering", response_model=PredictionOutputForQuestionAnswering, name="question answering") async def question_answering( request: Request, @@ -56,6 +60,7 @@ async def question_answering( return prediction + @router.post("/generation", response_model=PredictionOutputForGeneration, name="generation") async def generation( request: Request, diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index 224417b3d..aba698196 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -16,6 +16,7 @@ DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) # Batch size used for many inputs BATCH_SIZE: int = config("BATCH_SIZE", cast=int, default=32) +# Inputs larger than this size are rejected MAX_INPUT_SIZE: int = config("MAX_INPUT_SIZE", cast=int, default=1024) # Cache directory where model weights are stored diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 9498a0b38..6011b30ec 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -24,7 +24,11 @@ "max_input_size": MAX_INPUT_SIZE } + def _startup_model(app: FastAPI) -> None: + """ + Initialize the model used by the server and set it to the app state for global access + """ if MODEL_TYPE not in MODEL_MAPPING: raise RuntimeError(f"Unknown MODEL_MAPPING. 
Must be one of {MODEL_MAPPING.keys()}") model_instance = MODEL_MAPPING[MODEL_TYPE](**MODEL_KWARGS) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 06995863c..5c97532e9 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -1,5 +1,3 @@ -import json - from loguru import logger from transformers import AutoModelWithHeads, list_adapters diff --git a/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py index 34a47e56e..55f1fea20 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py @@ -1,5 +1,5 @@ from pydantic import BaseModel -class HearbeatResult(BaseModel): +class HeartbeatResult(BaseModel): is_alive: bool diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 3f0963f69..a3f1d7fbb 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,5 +1,4 @@ -from collections import Iterable -from typing import Dict, Union, Tuple, List, Optional +from typing import Dict, Union, Tuple, List, Optional, Iterable import torch from io import BytesIO From d75668c991056802e4e97fd27f133f6e95f3e395 Mon Sep 17 00:00:00 2001 From: Geigle Date: Wed, 14 Jul 2021 16:17:16 +0200 Subject: [PATCH 14/23] Added offline encoding script for data api --- .../offline_encoding_for_data_api.py | 322 ++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 square-model-inference-api/offline_encoding_for_data_api.py diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py new file mode 100644 index 000000000..4f787e24e --- /dev/null +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -0,0 +1,322 @@ +# This is a stand-alone script that does not require additional code (except the imported packages). +# We copy-pasted code from Model API and removing some not needed stuff for this. +import argparse +import os +import logging +import json +import pickle +from typing import Union, List + +import torch +from pydantic import BaseModel, Field +# Conditionally load adapter or sentence-transformer later to simplify installation +#from sentence_transformers import SentenceTransformer as SentenceTransformerModel +import transformers +from transformers import AutoModel, AutoTokenizer #, AutoModelWithHeads + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class PredictionRequest(BaseModel): + """ + Prediction request containing the input, pre-processing parameters, parameters for the model forward pass, + the task with task-specific parameters, and parameters for any post-processing + """ + input: Union[List[str], List[List[str]], dict] = Field( + ..., + description="Input for the model. 
Supports Huggingface Transformer inputs (i.e., list of sentences, or " + "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " + "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True. " + "

" + "Transformer/ Adapter:
" + "Task 'question_answering' expects the input to be in the (question, context) format." + ) + preprocessing_kwargs: dict = Field( + default={}, + description="Optional dictionary containing additional parameters for the pre-processing step.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the Huggingface tokenizer for possible parameters." + ) + model_kwargs: dict = Field( + default={}, + description="Optional dictionary containing parameters that are passed to the model for the forward pass " + "to control what additional tensors are returned.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" + "For example, set ‘output_attentions=True’ to receive the attention results in the output." + ) + task_kwargs: dict = Field( + default={}, + description="Optional dictionary containing additional parameters for handling of the task and " + "task-related post-processing.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter:
" + "'sentence_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'token_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'embedding':
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "'question_answering':
" + "- 'topk': Return the top-k most likely spans. Default 1.
" + "- 'max_answer_len': Maximal token length of answers. Default 128.
" + "'generation':
" + "- 'clean_up_tokenization_spaces': See parameter in Huggingface tokenizer.decode(). Default False
" + "- See Huggingface model.generate() for all possible parameters that can be used. " + "Note, 'model_kwargs' and 'task_kwargs' are merged for generation." + + ) + + +class SentenceTransformer: + """ + The class for all sentence-transformers models + """ + + def __init__(self, model_name, batch_size, disable_gpu): + """ + Initialize the SentenceTransformer + :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param max_input_size: requests with a larger input are rejected + """ + self._load_model(model_name, disable_gpu) + self.batch_size = batch_size + + def _load_model(self, model_name, disable_gpu): + """ + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. + """ + import sentence_transformers + logger.debug(f"Loading model {model_name}") + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = sentence_transformers.SentenceTransformer(model_name_or_path=model_name, device=device) + logger.info(f"Model {model_name} loaded on {device}") + self.model = model + + def embedding(self, request): + embeddings = self.model.encode(request.input, batch_size=self.batch_size, show_progress_bar=False, convert_to_tensor=True) + return embeddings + + +class Transformer: + """ + The class for all Huggingface transformer-based models + """ + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + + def __init__(self, model_name, batch_size, disable_gpu): + """ + Initialize the Transformer + :param model_name: the Huggingface model name + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + """ + self._load_model(AutoModel, model_name, disable_gpu) + self.batch_size = batch_size + + def _load_model(self, model_cls, model_name, disable_gpu): + """ + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. + """ + logger.debug(f"Loading model {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) + # Check if GPU is available + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = model_cls.from_pretrained(model_name).to(device) + logger.info(f"Model {model_name} loaded on {device}") + + self.model = model + self.tokenizer = tokenizer + + def _ensure_tensor_on_device(self, **inputs): + """ + Ensure PyTorch tensors are on the specified device. + + Args: + inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`. + + Return: + :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device. + """ + return {name: tensor.to(self.model.device) for name, tensor in inputs.items()} + + def _predict(self, request, output_features=False): + """ + Inference on the input. + :param request: the request with the input and optional kwargs + :param output_features: return the features of the input. + Necessary if, e.g., attention mask is needed for post-processing. 
+ :return: The model outputs and optionally the input features + """ + all_predictions = [] + request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", True) + request.preprocessing_kwargs["truncation"] = request.preprocessing_kwargs.get("truncation", True) + features = self.tokenizer(request.input, + return_tensors="pt", + **request.preprocessing_kwargs) + for start_idx in range(0, len(request.input), self.batch_size): + with torch.no_grad(): + input_features = {k: features[k][start_idx:start_idx+self.batch_size] for k in features.keys()} + input_features = self._ensure_tensor_on_device(**input_features) + predictions = self.model(**input_features, **request.model_kwargs) + all_predictions.append(predictions) + keys = all_predictions[0].keys() + final_prediction = {} + for key in keys: + # HuggingFace outputs for 'attentions' and more is returned as tuple of tensors + # Tuple of tuples only exists for 'past_key_values' which is only relevant for generation. + # Generation should NOT use this function + if isinstance(all_predictions[0][key], tuple): + tuple_of_lists = list(zip(*[[p.cpu() for p in tpl[key]] for tpl in all_predictions])) + final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists) + else: + final_prediction[key] = torch.cat([p[key].cpu() for p in all_predictions]) + if output_features: + return final_prediction, features + return final_prediction + + def embedding(self, request): + request.model_kwargs["output_hidden_states"] = True + predictions, features = self._predict(request, output_features=True) + # We remove hidden_states from predictions! + hidden_state = predictions.pop("hidden_states")[-1] + attention_mask = features["attention_mask"] + + embedding_mode = request.task_kwargs.get("embedding_mode", "mean") + + if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: + raise ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") + + if embedding_mode == "cls": + emb = hidden_state[:, 0, :] + # copied from sentence-transformers pooling + elif embedding_mode == "max": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + hidden_state[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + emb = torch.max(hidden_state, 1)[0] + # copied from sentence-transformers pooling + elif embedding_mode == "mean": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + emb = sum_embeddings / sum_mask + elif embedding_mode == "token": + emb = hidden_state + #word_ids = [features.word_ids(i) for i in range(len(request.input))] + return emb + + +class AdapterTransformer(Transformer): + """ + The class for all adapter-based models using the adapter-transformers package + """ + def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, adapter_name): + """ + Initialize the Adapter with its underlying Transformer and pre-load all available adapters from adapterhub.ml + :param model_name: the Huggingface model name + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param transformers_cache: Should be same as TRANSFORMERS_CACHE env variable. 
+ This folder will be used to store the adapters + """ + self._load_model(transformers.AutoModelWithHeads, model_name, disable_gpu) + self.model.load_adapter(adapter_name, load_as=adapter_name, cache_dir=transformers_cache) + self.model.to(self.model.device) + self.model.set_active_adapters(adapter_name) + self.batch_size = batch_size + + +def read_batch(file_pointer, batch_size): + i = 0 + lines = [] + finished = False + while i < batch_size: + line = file_pointer.readline().strip() + # empty string -> file finished + if not line: + finished = True + break + else: + lines.append(line) + i += 1 + return lines, finished + + +def encode(args): + transformers_cache = args.transformers_cache + os.environ["TRANSFORMERS_CACHE"] = transformers_cache + model_name = args.model_name + model_type = args.model_type + batch_size = args.batch_size + chunk_size = args.chunk_size + input_file = args.input_file + output_file = os.path.splitext(args.output_file)[0] # Remove .pkl from name + adapter_name = args.adapter_name + + if model_type == "sentence-transformer": + model = SentenceTransformer(model_name, batch_size, False) + elif model_type == "transformer": + model = Transformer(model_name, batch_size, False) + elif model_type == "adapter": + model = AdapterTransformer(model_name, batch_size, False, transformers_cache, adapter_name) + + logger.info(f"Reading input from {input_file}") + if os.path.dirname(output_file): + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(input_file, "r", encoding="utf-8") as f_in: + # We do not know how large the input file is so we read it batch-wise for memory safety reasons + finished_reading = False + total_processed_lines = 0 + chunk_idx = 0 + current_processed_lines = 0 + current_output = {} + while not finished_reading: + lines, finished_reading = read_batch(f_in, batch_size) + if lines: + lines = [json.loads(line) for line in lines] + texts = [line["text"] for line in lines] + ids = [line["id"] for line in lines] + + input = PredictionRequest(input=texts) + embeddings = model.embedding(input) + embeddings = embeddings.cpu().numpy() + + current_output.update({id: row for id, row in zip(ids, embeddings)}) + + total_processed_lines += len(lines) + current_processed_lines += len(lines) + + if current_processed_lines >= chunk_size or finished_reading: + logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) + + current_processed_lines = 0 + current_output = {} + chunk_idx += 1 + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--transformers_cache", help="Cache folder where model files will be downloaded into and loaded from") + parser.add_argument("--model_name", help="Model name, i.e., name used in transformers oder sentence-transformers to load the pre-trained model") + parser.add_argument("--model_type", help="Model type, one of 'adapter', 'transformer', 'sentence-transformer'") + parser.add_argument("--batch_size", type=int, help="Batch size used for encoding") + parser.add_argument("--chunk_size", type=int, help="Chunk size used for writing out embeddings. 
" + "ATTENTION: This value will be set to the first value satisfying true_chunk_size mod batch_size == 0" + "Each output file contains chunk_size embeddings " + "(except the last one if len($input) mod chunk_size != 0)") + parser.add_argument("--input_file", help="Input .jsonl file. Each line is a dict object: {'id': 'xxx', 'text': 'abc...'}") + parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." + "Format: Dict[str, numpy.ndarray], i.e. mapping ids to the document embeddings") + parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") + args = parser.parse_args() + + encode(args) From 16182cbbae960dde569c13d767441fed7320bc9f Mon Sep 17 00:00:00 2001 From: Geigle Date: Wed, 14 Jul 2021 16:37:21 +0200 Subject: [PATCH 15/23] Adapter and GPUs: Fixed adapters not moved to model device --- .../square_model_inference/inference/adaptertransformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 5c97532e9..99911bef2 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -45,6 +45,8 @@ def _load_adapter(self, model_name, transformers_cache): logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") else: raise e + # Move all freshly loaded adapter weights to the same device as the model + self.model.to(self.model.device) # def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: From f1d8e00219400fe7125723f8110d3b3b37fdcada Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 22 Jul 2021 09:47:29 +0200 Subject: [PATCH 16/23] Added support for .hdf5 and float16 storage of embeddings. Note: pickle seems to actually be slightly smaller than hdf5 --- .../offline_encoding_for_data_api.py | 77 ++++++------------- 1 file changed, 24 insertions(+), 53 deletions(-) diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py index 4f787e24e..66c48696f 100644 --- a/square-model-inference-api/offline_encoding_for_data_api.py +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -5,10 +5,10 @@ import logging import json import pickle +from dataclasses import dataclass from typing import Union, List - +import h5py import torch -from pydantic import BaseModel, Field # Conditionally load adapter or sentence-transformer later to simplify installation #from sentence_transformers import SentenceTransformer as SentenceTransformerModel import transformers @@ -18,55 +18,16 @@ logger.setLevel(logging.DEBUG) -class PredictionRequest(BaseModel): +@dataclass +class PredictionRequest: """ Prediction request containing the input, pre-processing parameters, parameters for the model forward pass, the task with task-specific parameters, and parameters for any post-processing """ - input: Union[List[str], List[List[str]], dict] = Field( - ..., - description="Input for the model. 
Supports Huggingface Transformer inputs (i.e., list of sentences, or " - "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " - "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True. " - "

" - "Transformer/ Adapter:
" - "Task 'question_answering' expects the input to be in the (question, context) format." - ) - preprocessing_kwargs: dict = Field( - default={}, - description="Optional dictionary containing additional parameters for the pre-processing step.

" - "SentenceTransformer: This is ignored.
" - "Transformer/ Adapter: See the Huggingface tokenizer for possible parameters." - ) - model_kwargs: dict = Field( - default={}, - description="Optional dictionary containing parameters that are passed to the model for the forward pass " - "to control what additional tensors are returned.

" - "SentenceTransformer: This is ignored.
" - "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" - "For example, set ‘output_attentions=True’ to receive the attention results in the output." - ) - task_kwargs: dict = Field( - default={}, - description="Optional dictionary containing additional parameters for handling of the task and " - "task-related post-processing.

" - "SentenceTransformer: This is ignored.
" - "Transformer/ Adapter:
" - "'sentence_classification':
" - "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" - "'token_classification':
" - "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" - "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" - "'question_answering':
" - "- 'topk': Return the top-k most likely spans. Default 1.
" - "- 'max_answer_len': Maximal token length of answers. Default 128.
" - "'generation':
" - "- 'clean_up_tokenization_spaces': See parameter in Huggingface tokenizer.decode(). Default False
" - "- See Huggingface model.generate() for all possible parameters that can be used. " - "Note, 'model_kwargs' and 'task_kwargs' are merged for generation." - - ) + input: Union[List[str], List[List[str]], dict] + preprocessing_kwargs: dict + model_kwargs: dict + task_kwargs: dict class SentenceTransformer: @@ -282,9 +243,11 @@ def encode(args): texts = [line["text"] for line in lines] ids = [line["id"] for line in lines] - input = PredictionRequest(input=texts) + input = PredictionRequest(input=texts, preprocessing_kwargs={}, model_kwargs={}, task_kwargs={"embedding_mode": "mean"}) embeddings = model.embedding(input) embeddings = embeddings.cpu().numpy() + if args.float16: + embeddings = embeddings.astype("float16") current_output.update({id: row for id, row in zip(ids, embeddings)}) @@ -294,11 +257,17 @@ def encode(args): if current_processed_lines >= chunk_size or finished_reading: logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") - chunk_output_file = f"{output_file}_{chunk_idx}.pkl" - with open(chunk_output_file, "wb") as out_f: - logger.info(f"Writing chunk in {chunk_output_file}") - pickle.dump(current_output, out_f) - + if args.hdf5: + chunk_output_file = f"{output_file}_{chunk_idx}.h5" + with h5py.File(chunk_output_file, "w") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + for k, v in current_output.items(): + out_f.create_dataset(k, data=v) + else: + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) current_processed_lines = 0 current_output = {} chunk_idx += 1 @@ -317,6 +286,8 @@ def encode(args): parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." "Format: Dict[str, numpy.ndarray], i.e. 
mapping ids to the document embeddings") parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") + parser.add_argument("--hdf5", action="store_true") + parser.add_argument("--float16", action="store_true") args = parser.parse_args() encode(args) From fe0e868d9351488db410510c5c90492992a71706 Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 30 Jul 2021 14:40:25 +0200 Subject: [PATCH 17/23] Offline Encoding: changed output format so hdf5 can better compress it; added hdf5 gzip compression option --- .../offline_encoding_for_data_api.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py index 66c48696f..2e1036145 100644 --- a/square-model-inference-api/offline_encoding_for_data_api.py +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -9,6 +9,7 @@ from typing import Union, List import h5py import torch +import numpy as np # Conditionally load adapter or sentence-transformer later to simplify installation #from sentence_transformers import SentenceTransformer as SentenceTransformerModel import transformers @@ -235,7 +236,7 @@ def encode(args): total_processed_lines = 0 chunk_idx = 0 current_processed_lines = 0 - current_output = {} + current_output = {"ids": [], "embeddings": []} while not finished_reading: lines, finished_reading = read_batch(f_in, batch_size) if lines: @@ -249,7 +250,8 @@ def encode(args): if args.float16: embeddings = embeddings.astype("float16") - current_output.update({id: row for id, row in zip(ids, embeddings)}) + current_output["ids"].extend(ids) + current_output["embeddings"].append(embeddings) total_processed_lines += len(lines) current_processed_lines += len(lines) @@ -257,19 +259,25 @@ def encode(args): if current_processed_lines >= chunk_size or finished_reading: logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + current_output["embeddings"] = np.concatenate(current_output["embeddings"]) + if args.hdf5: chunk_output_file = f"{output_file}_{chunk_idx}.h5" with h5py.File(chunk_output_file, "w") as out_f: logger.info(f"Writing chunk in {chunk_output_file}") - for k, v in current_output.items(): - out_f.create_dataset(k, data=v) + if args.hdf5_gzip_level < 0: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S")) + out_f.create_dataset("embeddings", data=current_output["embeddings"]) + else: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S"), compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + out_f.create_dataset("embeddings", data=current_output["embeddings"], compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) else: chunk_output_file = f"{output_file}_{chunk_idx}.pkl" with open(chunk_output_file, "wb") as out_f: logger.info(f"Writing chunk in {chunk_output_file}") pickle.dump(current_output, out_f) current_processed_lines = 0 - current_output = {} + current_output = {"ids": [], "embeddings": []} chunk_idx += 1 if __name__ == "__main__": @@ -279,14 +287,17 @@ def encode(args): parser.add_argument("--model_type", help="Model type, one of 'adapter', 'transformer', 'sentence-transformer'") parser.add_argument("--batch_size", type=int, help="Batch size used for encoding") parser.add_argument("--chunk_size", type=int, help="Chunk size used for writing out embeddings. 
" - "ATTENTION: This value will be set to the first value satisfying true_chunk_size mod batch_size == 0" + "ATTENTION: This value will be set to the first value satisfying: true_chunk_size mod batch_size == 0" "Each output file contains chunk_size embeddings " - "(except the last one if len($input) mod chunk_size != 0)") + "(except the last one if len(input) mod chunk_size != 0)") parser.add_argument("--input_file", help="Input .jsonl file. Each line is a dict object: {'id': 'xxx', 'text': 'abc...'}") parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." - "Format: Dict[str, numpy.ndarray], i.e. mapping ids to the document embeddings") + "Format: {'ids': List[str], 'embeddings': ndarray}. " + "Note for hdf5, use f['ids'].asstr() to load ids as string because default is binary.") parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") parser.add_argument("--hdf5", action="store_true") + parser.add_argument("--hdf5-gzip-level", type=int, default=4, help="GZIP compression level for HDF5 in range 0-9, default 4 (bigger is more compressed). " + "Set to negative value to disable compression") parser.add_argument("--float16", action="store_true") args = parser.parse_args() From fe59bcf41410b099fe123a905544e5704d5e066a Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 30 Jul 2021 16:15:47 +0200 Subject: [PATCH 18/23] Included in CICD Pipeline; changed logging formatting for ELK stack --- .github/workflows/ci.yml | 183 ++++++++++++++++++ square-model-inference-api/README.md | 3 + .../auth_server/Dockerfile | 7 +- .../auth_server/ElkJsonFormatter.tar.gz | Bin 0 -> 1407 bytes .../auth_server/auth_api/security.py | 5 +- .../auth_server/logging.conf | 21 ++ .../auth_server/main.py | 3 + .../auth_server/requirements.txt | 1 + .../example_docker-compose.yml | 11 +- .../inference_server/Dockerfile | 15 +- .../inference_server/ElkJsonFormatter.tar.gz | Bin 0 -> 1407 bytes .../inference_server/logging.conf | 21 ++ .../inference_server/main.py | 6 +- .../inference_server/requirements1.txt | 3 +- square-model-inference-api/nginx/nginx.conf | 17 ++ 15 files changed, 283 insertions(+), 13 deletions(-) create mode 100644 square-model-inference-api/auth_server/ElkJsonFormatter.tar.gz create mode 100644 square-model-inference-api/auth_server/logging.conf create mode 100644 square-model-inference-api/inference_server/ElkJsonFormatter.tar.gz create mode 100644 square-model-inference-api/inference_server/logging.conf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5c484fd29..6f860fb37 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,3 +151,186 @@ jobs: run: | rm -rf /tmp/.buildx-cache mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + model-api: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Prepare + id: prep + run: | + TAG=$(echo $GITHUB_SHA | head -c7) + IMAGE="ukpsquare/square-model-api" + echo ::set-output name=image::${IMAGE} + echo ::set-output name=tag::${TAG} + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + install: true + + - name: Cache Docker layers + uses: actions/cache@v2 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-model_api-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx-model_api- + ${{ runner.os }}-buildx- + + - name: Build test image + uses: docker/build-push-action@v2 + with: + 
builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/inference_server + target: test + load: true + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + - name: Retrieve Test Reports + id: extract + uses: shrink/actions-docker-extract@v1 + with: + image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + path: /app/test-reports + + - uses: actions/upload-artifact@v2 + with: + name: model_api-test-reports + path: ${{ steps.extract.outputs.destination }}/test-reports + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v2 + with: + report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml + check_name: Model API Test Report + fail_on_failure: true + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + + - name: Build deployable image + uses: docker/build-push-action@v2 + with: + builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/inference_server + target: build + push: ${{github.ref == 'refs/heads/master'}} + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}, ${{ steps.prep.outputs.image }}:latest + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + model-api-auth: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Prepare + id: prep + run: | + TAG=$(echo $GITHUB_SHA | head -c7) + IMAGE="ukpsquare/square-model-api-auth" + echo ::set-output name=image::${IMAGE} + echo ::set-output name=tag::${TAG} + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + install: true + + - name: Cache Docker layers + uses: actions/cache@v2 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-model_api_auth-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx-model_api_auth- + ${{ runner.os }}-buildx- + +# No tests for auth server currently. 
+# - name: Build test image +# uses: docker/build-push-action@v2 +# with: +# builder: ${{ steps.buildx.outputs.name }} +# context: ./square-model-inference-api/auth_server +# target: test +# load: true +# tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test +# cache-from: type=local,src=/tmp/.buildx-cache +# cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new +# +# # Temp fix +# # https://github.com/docker/build-push-action/issues/252 +# # https://github.com/moby/buildkit/issues/1896 +# - name: Move cache +# run: | +# rm -rf /tmp/.buildx-cache +# mv /tmp/.buildx-cache-new /tmp/.buildx-cache +# +# - name: Retrieve Test Reports +# id: extract +# uses: shrink/actions-docker-extract@v1 +# with: +# image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test +# path: /app/test-reports +# +# - uses: actions/upload-artifact@v2 +# with: +# name: model_api_auth-test-reports +# path: ${{ steps.extract.outputs.destination }}/test-reports +# +# - name: Publish Test Report +# uses: mikepenz/action-junit-report@v2 +# with: +# report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml +# check_name: Model API Auth Test Report +# fail_on_failure: true + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + + - name: Build deployable image + uses: docker/build-push-action@v2 + with: + builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/auth_server + #target: build # Not multistage + push: ${{github.ref == 'refs/heads/master'}} + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}, ${{ steps.prep.outputs.image }}:latest + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache \ No newline at end of file diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index c51c79ab3..26dcb90a1 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -30,6 +30,9 @@ to handle authorization of requests with the auth server. └───example_docker-compose.yml # Example docker-compose setup for the Model API ``` +### Logging +The components use the json-formatted logging used by the ELK Stack in square-core/logging. + ## Requirements Python 3.7+, Docker (optional), Make (optional) diff --git a/square-model-inference-api/auth_server/Dockerfile b/square-model-inference-api/auth_server/Dockerfile index aadc437d3..e630e0821 100644 --- a/square-model-inference-api/auth_server/Dockerfile +++ b/square-model-inference-api/auth_server/Dockerfile @@ -5,10 +5,13 @@ ENV PYTHONUNBUFFERED 1 EXPOSE 8081 WORKDIR /app +COPY ElkJsonFormatter.tar.gz ./ElkJsonFormatter.tar.gz RUN pip install --upgrade pip COPY requirements.txt ./ RUN pip install -r requirements.txt -COPY . 
./ +COPY main.py main.py +COPY ./auth_api auth_api +COPY logging.conf logging.conf -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081"] \ No newline at end of file +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081", "--log-config", "logging.conf"] \ No newline at end of file diff --git a/square-model-inference-api/auth_server/ElkJsonFormatter.tar.gz b/square-model-inference-api/auth_server/ElkJsonFormatter.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..19a8ecae00106938f782c0e9437af63071c4adce GIT binary patch literal 1407 zcmV-_1%Ub=iwFp$Kh$6X|72-%bT370Yf5u(Zbol%ZDDkDWpXVrE-)@KE_7jX0PUPx zQ{qS%hP}?O=<)`rf|!IH)GeyU+HrOcJLr1Yxv;F*CY{i0PMdBV`S;tMfIvV|bTaPn zy*H3fcLzwm_sQ33THR51-W}t+j|BT9-})iaN ztn$?^Up2#=kNLUyH*L$=LROveFM{CFdZJno{Ce@Pz6t*~7yaJ=|F6aWcbE9TVegvS zE)ZaI{l8kg^U~0Gmj4^V0W|Rcjq!iOUiAMi_ifL``HA%lWTPkH~ zP#m(kymdrwNmMi}OVpwu!7*80z4)1C>pxf(XIVr&%xRhw^&6*uogbVWHrDWM;6Lym z_z(PF#edhO3FWT)9PU4r|C;4g_%GB2{@2ETwR*ZIfUlN=G5l(J8{@y-MgHr+|62I( zEF1sb+jAUROr-6vt#p-GVLtQi{58~@d9&4BUWjq!iT|K3dgC;cDtKk)wt{r~vO zADvJA4@a+<{~1no{MT?a6Y{@0^p$YzV~#t2lbq2s*+;sF|2~e%{$lobU#GZ8l+$4x z<9xJ_&ih~bN8kF&$22CL42NV|bTH(DG_P)VNQ`MTE&Y>vBw;hn-V^5KG~>n5CWXfl z=5oY!S_1J4{0II6|3S6!|Mckm%&;>?@4$ zVWwsyMiQSS-pGw;a>G_s&&j_-nv+%4c$&GzQ`+ZT z-oSt0Kky&48T?;nyf5Uxrs);_TekfF7x4eb3ql?8f8amxANUXae-i&^GpSd~{}=xM zZJ4@a0srezIrY0E=V^D!8OXh!Aua(OJOKC){0IJnfd4NXB3;RUqniJlT?6?4Q_eqx zKJdSG{=@noKac;**8k{gyIluXfZAaGuQAJSef}@Z|I;nU)`9grO@R>|81whi=LC9x65pzV(o%fD=Ie5BWdj|Des{{~Bw(t;2u$u)o6pt^?!$ zwWwJCtvK5}O(Qm0_^l{uqkty9J02ZiO8FPTX0arxT${AFUA=m?+r`dt*4x`IE~ate zq?_Jcr%jr8(a bool: if header is None: + logger.info("Attempted access without API Key") raise HTTPException( status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_HEADER_NAME), headers={} ) elif not secrets.compare_digest(header, str(API_KEY)): + logger.info(f"Attempted access with wrong API Key {header}") raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, detail=AUTH_REQ, headers={} ) + logger.info("Successful authorization with API Key") return True diff --git a/square-model-inference-api/auth_server/logging.conf b/square-model-inference-api/auth_server/logging.conf new file mode 100644 index 000000000..d62db8187 --- /dev/null +++ b/square-model-inference-api/auth_server/logging.conf @@ -0,0 +1,21 @@ +[loggers] +keys = root + +[logger_root] +level = DEBUG +handlers = root + +[handlers] +keys = root + +[handler_root] +class = StreamHandler +level = DEBUG +formatter = json + +[formatters] +keys = json + +[formatter_json] +class = ElkJsonFormatter.ElkJsonFormatter + diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index 93894341f..c137f9d0a 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -1,5 +1,8 @@ from fastapi import FastAPI, Depends import auth_api.security as security +from logging.config import fileConfig + +fileConfig("logging.conf") app = FastAPI() diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt index 1141b0ad8..a7128b727 100644 --- a/square-model-inference-api/auth_server/requirements.txt +++ b/square-model-inference-api/auth_server/requirements.txt @@ -1,3 +1,4 @@ uvicorn==0.13.4 # ASGI server fastapi==0.65.1 # REST API Framework pydantic==1.8.2 # Input/ output modelling +ElkJsonFormatter.tar.gz # Logging Formatter \ No newline at end of file diff --git 
a/square-model-inference-api/example_docker-compose.yml b/square-model-inference-api/example_docker-compose.yml index d15b00b0a..e03421974 100644 --- a/square-model-inference-api/example_docker-compose.yml +++ b/square-model-inference-api/example_docker-compose.yml @@ -2,7 +2,8 @@ version: "3" services: auth: - build: ./auth_server/. + #build: ./auth_server/. + image: ukpsquare/square-model-api-auth:latest ports: - 8081:8081 env_file: @@ -17,8 +18,8 @@ services: # Example config for one model server inference_bert: - build: ./inference_server/. - image: square_model_inference:latest + #build: ./inference_server/. + image: ukpsquare/square-model-api:latest ports: - 8000:8000 env_file: @@ -30,8 +31,8 @@ services: # Example config for another model server inference_roberta: - build: ./inference_server/. - image: square_model_inference:latest + #build: ./inference_server/. + image: ukpsquare/square-model-api:latest ports: - 8001:8000 env_file: diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index fefc84ae8..041a92c18 100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -5,6 +5,7 @@ ENV PYTHONUNBUFFERED 1 EXPOSE 8000 WORKDIR /app +COPY ElkJsonFormatter.tar.gz ./ElkJsonFormatter.tar.gz RUN pip install --upgrade pip COPY requirements1.txt requirements2.txt uninstall_requirements.txt ./ RUN pip install -r requirements1.txt @@ -19,11 +20,19 @@ COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_doc RUN python ./tests/pre_test_setup_for_docker_caching.py COPY . ./ RUN pip install pytest pytest-cov pytest-asyncio -RUN PYTHONPATH=./ pytest --cov +RUN mkdir test-reports +RUN PYTHONPATH=./ pytest \ + --junitxml=test-reports/junit.xml \ + --cov \ + --cov-report=xml:test-reports/coverage.xml \ + --cov-report=html:test-reports/coverage.html; \ + echo $? > test-reports/pytest.existcode # Deployment stage FROM base as build -COPY . 
./ +COPY main.py main.py +COPY ./square_model_inference square_model_inference +COPY logging.conf logging.conf -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--log-config", "logging.conf"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/ElkJsonFormatter.tar.gz b/square-model-inference-api/inference_server/ElkJsonFormatter.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..19a8ecae00106938f782c0e9437af63071c4adce GIT binary patch literal 1407 zcmV-_1%Ub=iwFp$Kh$6X|72-%bT370Yf5u(Zbol%ZDDkDWpXVrE-)@KE_7jX0PUPx zQ{qS%hP}?O=<)`rf|!IH)GeyU+HrOcJLr1Yxv;F*CY{i0PMdBV`S;tMfIvV|bTaPn zy*H3fcLzwm_sQ33THR51-W}t+j|BT9-})iaN ztn$?^Up2#=kNLUyH*L$=LROveFM{CFdZJno{Ce@Pz6t*~7yaJ=|F6aWcbE9TVegvS zE)ZaI{l8kg^U~0Gmj4^V0W|Rcjq!iOUiAMi_ifL``HA%lWTPkH~ zP#m(kymdrwNmMi}OVpwu!7*80z4)1C>pxf(XIVr&%xRhw^&6*uogbVWHrDWM;6Lym z_z(PF#edhO3FWT)9PU4r|C;4g_%GB2{@2ETwR*ZIfUlN=G5l(J8{@y-MgHr+|62I( zEF1sb+jAUROr-6vt#p-GVLtQi{58~@d9&4BUWjq!iT|K3dgC;cDtKk)wt{r~vO zADvJA4@a+<{~1no{MT?a6Y{@0^p$YzV~#t2lbq2s*+;sF|2~e%{$lobU#GZ8l+$4x z<9xJ_&ih~bN8kF&$22CL42NV|bTH(DG_P)VNQ`MTE&Y>vBw;hn-V^5KG~>n5CWXfl z=5oY!S_1J4{0II6|3S6!|Mckm%&;>?@4$ zVWwsyMiQSS-pGw;a>G_s&&j_-nv+%4c$&GzQ`+ZT z-oSt0Kky&48T?;nyf5Uxrs);_TekfF7x4eb3ql?8f8amxANUXae-i&^GpSd~{}=xM zZJ4@a0srezIrY0E=V^D!8OXh!Aua(OJOKC){0IJnfd4NXB3;RUqniJlT?6?4Q_eqx zKJdSG{=@noKac;**8k{gyIluXfZAaGuQAJSef}@Z|I;nU)`9grO@R>|81whi=LC9x65pzV(o%fD=Ie5BWdj|Des{{~Bw(t;2u$u)o6pt^?!$ zwWwJCtvK5}O(Qm0_^l{uqkty9J02ZiO8FPTX0arxT${AFUA=m?+r`dt*4x`IE~ate zq?_Jcr%jr8(a FastAPI: + # Set logging config. Loguru uses this then, too + fileConfig("logging.conf") fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) fast_app.include_router(api_router, prefix=API_PREFIX) diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt index 00e9c0a57..794d45f82 100644 --- a/square-model-inference-api/inference_server/requirements1.txt +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -5,4 +5,5 @@ loguru==0.5.3 # Easier logging python-dotenv==0.17.1 # Required for .env configs sentencepiece==0.1.95 torch==1.8.1 -sentence-transformers==1.2.0 \ No newline at end of file +sentence-transformers==1.2.0 +ElkJsonFormatter.tar.gz # Logging Formatter \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index c6051be8f..1858be5bf 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -3,7 +3,24 @@ events { } http { + log_format json_combined escape=json + '{ "@timestamp": "$time_iso8601", ' + '"remote_addr": "$remote_addr", ' + '"http_referer": "$http_referer", ' + '"request": "$request", ' + '"status": $status, ' + '"body_bytes_sent": $body_bytes_sent, ' + '"http_user_agent": "$http_user_agent", ' + '"http_x_forwarded_for": "$http_x_forwarded_for", ' + '"upstream_addr": "$upstream_addr",' + '"upstream_http_host": "$upstream_http_host",' + '"upstream_response_time": "$upstream_response_time",' + '"request_time": "$request_time"' + '}'; + server { + access_log /var/log/nginx/access.log json_combined; + listen 8080; location / { From 547791511c63d7761c44cef3b2a95718533208d6 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 5 Aug 2021 14:10:59 +0200 Subject: [PATCH 19/23] Updated adapter-transformers version to fix error; added 
basic tests for auth server and included that in ci --- .github/workflows/ci.yml | 75 +++++++++---------- .../auth_server/Dockerfile | 15 +++- .../auth_server/main.py | 7 +- .../auth_server/tests/conftest.py | 24 ++++++ .../auth_server/tests/test_api/__init__.py | 0 .../auth_server/tests/test_api/test_auth.py | 34 +++++++++ .../inference_server/main.py | 6 +- .../inference_server/requirements2.txt | 2 +- 8 files changed, 120 insertions(+), 43 deletions(-) create mode 100644 square-model-inference-api/auth_server/tests/conftest.py create mode 100644 square-model-inference-api/auth_server/tests/test_api/__init__.py create mode 100644 square-model-inference-api/auth_server/tests/test_api/test_auth.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6f860fb37..7bb1ad9c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -271,44 +271,43 @@ jobs: ${{ runner.os }}-buildx-model_api_auth- ${{ runner.os }}-buildx- -# No tests for auth server currently. -# - name: Build test image -# uses: docker/build-push-action@v2 -# with: -# builder: ${{ steps.buildx.outputs.name }} -# context: ./square-model-inference-api/auth_server -# target: test -# load: true -# tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test -# cache-from: type=local,src=/tmp/.buildx-cache -# cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new -# -# # Temp fix -# # https://github.com/docker/build-push-action/issues/252 -# # https://github.com/moby/buildkit/issues/1896 -# - name: Move cache -# run: | -# rm -rf /tmp/.buildx-cache -# mv /tmp/.buildx-cache-new /tmp/.buildx-cache -# -# - name: Retrieve Test Reports -# id: extract -# uses: shrink/actions-docker-extract@v1 -# with: -# image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test -# path: /app/test-reports -# -# - uses: actions/upload-artifact@v2 -# with: -# name: model_api_auth-test-reports -# path: ${{ steps.extract.outputs.destination }}/test-reports -# -# - name: Publish Test Report -# uses: mikepenz/action-junit-report@v2 -# with: -# report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml -# check_name: Model API Auth Test Report -# fail_on_failure: true + - name: Build test image + uses: docker/build-push-action@v2 + with: + builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/auth_server + target: test + load: true + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + - name: Retrieve Test Reports + id: extract + uses: shrink/actions-docker-extract@v1 + with: + image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + path: /app/test-reports + + - uses: actions/upload-artifact@v2 + with: + name: model_api_auth-test-reports + path: ${{ steps.extract.outputs.destination }}/test-reports + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v2 + with: + report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml + check_name: Model API Auth Test Report + fail_on_failure: true - name: Login to Docker Hub uses: docker/login-action@v1 diff --git a/square-model-inference-api/auth_server/Dockerfile 
b/square-model-inference-api/auth_server/Dockerfile index e630e0821..394ce24d8 100644 --- a/square-model-inference-api/auth_server/Dockerfile +++ b/square-model-inference-api/auth_server/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7.6-slim-buster +FROM python:3.7.6-slim-buster as base ENV PYTHONUNBUFFERED 1 @@ -14,4 +14,17 @@ COPY main.py main.py COPY ./auth_api auth_api COPY logging.conf logging.conf +FROM base as test +RUN pip install pytest pytest-cov pytest-asyncio +RUN mkdir test-reports +RUN PYTHONPATH=./ pytest \ + --junitxml=test-reports/junit.xml \ + --cov \ + --cov-report=xml:test-reports/coverage.xml \ + --cov-report=html:test-reports/coverage.html; \ + echo $? > test-reports/pytest.existcode + +# Deployment stage +FROM base as build + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081", "--log-config", "logging.conf"] \ No newline at end of file diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index c137f9d0a..fb323930d 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -1,9 +1,12 @@ from fastapi import FastAPI, Depends import auth_api.security as security from logging.config import fileConfig +from loguru import logger -fileConfig("logging.conf") - +try: + fileConfig("logging.conf") +except: + logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger") app = FastAPI() diff --git a/square-model-inference-api/auth_server/tests/conftest.py b/square-model-inference-api/auth_server/tests/conftest.py new file mode 100644 index 000000000..b06b27e94 --- /dev/null +++ b/square-model-inference-api/auth_server/tests/conftest.py @@ -0,0 +1,24 @@ +import pytest + +from starlette.config import environ +API_KEY = "test_key" +API_KEY_HEADER_NAME = "Authorization" +environ["API_KEY"] = API_KEY +environ["API_KEY_HEADER_NAME"] = API_KEY_HEADER_NAME + +from main import app + + +@pytest.fixture() +def test_app(): + return app + + +@pytest.fixture() +def test_key(): + return API_KEY + + +@pytest.fixture() +def test_header(): + return API_KEY_HEADER_NAME \ No newline at end of file diff --git a/square-model-inference-api/auth_server/tests/test_api/__init__.py b/square-model-inference-api/auth_server/tests/test_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/auth_server/tests/test_api/test_auth.py b/square-model-inference-api/auth_server/tests/test_api/test_auth.py new file mode 100644 index 000000000..10c33ed56 --- /dev/null +++ b/square-model-inference-api/auth_server/tests/test_api/test_auth.py @@ -0,0 +1,34 @@ +from starlette.testclient import TestClient + +def test_api_correct_auth(test_app, test_key, test_header) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth", + headers={test_header: test_key} + ) + assert response.status_code == 200 + assert response.json()["authenticated"] == True + + +def test_api_no_header(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth" + ) + assert response.status_code == 400 + +def test_api_wrong_header(test_app, test_key, test_header) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth", + headers={test_header+"wrong": test_key} + ) + assert response.status_code == 400 + +def test_api_wrong_key(test_app, test_key, test_header) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth", + 
headers={test_header: test_key+"wrong"} + ) + assert response.status_code == 401 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index 167adb04c..a3335add8 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -5,10 +5,14 @@ from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler from logging.config import fileConfig +from loguru import logger def get_app() -> FastAPI: # Set logging config. Loguru uses this then, too - fileConfig("logging.conf") + try: + fileConfig("logging.conf") + except: + logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger") fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) fast_app.include_router(api_router, prefix=API_PREFIX) diff --git a/square-model-inference-api/inference_server/requirements2.txt b/square-model-inference-api/inference_server/requirements2.txt index 05ea5692f..02424bec4 100644 --- a/square-model-inference-api/inference_server/requirements2.txt +++ b/square-model-inference-api/inference_server/requirements2.txt @@ -1 +1 @@ -adapter-transformers==2.1.1 \ No newline at end of file +adapter-transformers==2.1.2 \ No newline at end of file From c8203556a7f6aaba34fd67a20b90c015740fa80e Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 6 Aug 2021 15:33:16 +0200 Subject: [PATCH 20/23] Added multiprocessing with multiple GPUs for offline encoding (code only tested with 1 GPU) [skip ci] --- .../offline_encoding_for_data_api.py | 167 +++++++++++++++++- 1 file changed, 158 insertions(+), 9 deletions(-) diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py index 2e1036145..173982e6f 100644 --- a/square-model-inference-api/offline_encoding_for_data_api.py +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -5,6 +5,7 @@ import logging import json import pickle +import time from dataclasses import dataclass from typing import Union, List import h5py @@ -13,11 +14,15 @@ # Conditionally load adapter or sentence-transformer later to simplify installation #from sentence_transformers import SentenceTransformer as SentenceTransformerModel import transformers +import torch.multiprocessing as mp +import queue from transformers import AutoModel, AutoTokenizer #, AutoModelWithHeads logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(processName)s-%(levelname)s-%(asctime)s: %(message)s')) +logger.addHandler(handler) +logger.setLevel(logging.INFO) @dataclass class PredictionRequest: @@ -42,11 +47,13 @@ def __init__(self, model_name, batch_size, disable_gpu): :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) :param batch_size: batch size used for inference :param disable_gpu: do not move model to GPU even if CUDA is available - :param max_input_size: requests with a larger input are rejected """ self._load_model(model_name, disable_gpu) self.batch_size = batch_size + def to(self, device): + self.model.to(device) + def _load_model(self, model_name, disable_gpu): """ Load the Transformer model model_name and its tokenizer with Huggingface. 
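The multiprocessing added by this patch is a reader/encoder/writer pipeline: `_read_process` batches the input .jsonl into an input queue, one `_encode_process` per GPU consumes batches and pushes embeddings onto an output queue, and `_write_process` drains that queue and writes chunk files, stopping once nothing arrives before a timeout. A minimal sketch of that producer-consumer layout is shown here; the function and variable names are illustrative, not the script's own, and no actual GPU work is performed:

```python
import queue

import torch.multiprocessing as mp


def reader(input_queue, n_batches):
    # Producer: the real script reads the input .jsonl file batch-wise instead.
    for i in range(n_batches):
        input_queue.put([f"text {i}-{j}" for j in range(4)], block=True)


def encoder(device, input_queue, output_queue):
    # One consumer per GPU: pull a batch, encode it, push the result.
    while True:
        try:
            batch = input_queue.get(timeout=10)
        except queue.Empty:
            break
        # Placeholder for model.to(device) / model.embedding(batch) in the real script.
        output_queue.put((device, [len(text) for text in batch]))


def writer(output_queue):
    # Single writer: drain results until nothing new arrives within the timeout.
    while True:
        try:
            device, embeddings = output_queue.get(timeout=10)
        except queue.Empty:
            break
        print(device, embeddings)


if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # spawn start method is required when CUDA is involved
    input_queue, output_queue = ctx.Queue(maxsize=4), ctx.Queue()
    workers = [ctx.Process(target=reader, args=(input_queue, 8), daemon=True)]
    workers += [ctx.Process(target=encoder, args=(dev, input_queue, output_queue), daemon=True)
                for dev in ["cuda:0"]]
    workers += [ctx.Process(target=writer, args=(output_queue,), daemon=True)]
    for w in workers:
        w.start()
    workers[-1].join()  # join only the writer, as the patch does with write_p.join()
```

Because all worker processes are daemonic, joining only the writer is enough: once it stops receiving output and exits, the remaining processes are terminated together with the main process.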
@@ -80,6 +87,9 @@ def __init__(self, model_name, batch_size, disable_gpu): self._load_model(AutoModel, model_name, disable_gpu) self.batch_size = batch_size + def to(self, device): + self.model.to(device) + def _load_model(self, model_cls, model_name, disable_gpu): """ Load the Transformer model model_name and its tokenizer with Huggingface. @@ -169,7 +179,6 @@ def embedding(self, request): emb = sum_embeddings / sum_mask elif embedding_mode == "token": emb = hidden_state - #word_ids = [features.word_ids(i) for i in range(len(request.input))] return emb @@ -280,6 +289,135 @@ def encode(args): current_output = {"ids": [], "embeddings": []} chunk_idx += 1 + +def _read_process(input_file, batch_size, input_queue): + logger.info(f"Reading input from {input_file}") + with open(input_file, "r", encoding="utf-8") as f_in: + # We do not know how large the input file is so we read it batch-wise for memory safety reasons + finished_reading = False + while not finished_reading: + lines, finished_reading = read_batch(f_in, batch_size) + if lines: + lines = [json.loads(line) for line in lines] + texts = [line["text"] for line in lines] + ids = [line["id"] for line in lines] + + input = PredictionRequest(input=texts, preprocessing_kwargs={}, model_kwargs={}, task_kwargs={"embedding_mode": "mean"}) + input_queue.put((input, ids), block=True) + + +def _encode_process(model, device, float16, input_queue, output_queue): + logger.info(f"Moving model to {device}") + model.to(device) + while True: + try: + input, ids = input_queue.get() + embeddings = model.embedding(input) + embeddings = embeddings.cpu().numpy() + if float16: + embeddings = embeddings.astype("float16") + output_queue.put((embeddings, ids)) + except queue.Empty: + break + + +def _write_process(output_file, chunk_size, args, output_queue): + chunk_idx = 0 + current_processed_lines = 0 + total_processed_lines = 0 + current_output = {"ids": [], "embeddings": []} + while True: + try: + # We do not know when we are done writing. Instead we wait 60s and if we get nothing new, we assume we are done. 
+ # i.e., after the timeout, queue.Empty exception is triggered and we write the remaining output and finish + embeddings, ids = output_queue.get(timeout=60) + current_output["ids"].extend(ids) + current_output["embeddings"].append(embeddings) + + total_processed_lines += len(ids) + current_processed_lines += len(ids) + + if current_processed_lines >= chunk_size: + logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + + current_output["embeddings"] = np.concatenate(current_output["embeddings"]) + + if args.hdf5: + chunk_output_file = f"{output_file}_{chunk_idx}.h5" + with h5py.File(chunk_output_file, "w") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + if args.hdf5_gzip_level < 0: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S")) + out_f.create_dataset("embeddings", data=current_output["embeddings"]) + else: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S"), compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + out_f.create_dataset("embeddings", data=current_output["embeddings"], compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + else: + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) + current_processed_lines = 0 + current_output = {"ids": [], "embeddings": []} + chunk_idx += 1 + except queue.Empty: + logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + current_output["embeddings"] = np.concatenate(current_output["embeddings"]) + if args.hdf5: + chunk_output_file = f"{output_file}_{chunk_idx}.h5" + with h5py.File(chunk_output_file, "w") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + if args.hdf5_gzip_level < 0: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S")) + out_f.create_dataset("embeddings", data=current_output["embeddings"]) + else: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S"), compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + out_f.create_dataset("embeddings", data=current_output["embeddings"], compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + else: + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) + break + + +def encode_multiprocess(args): + transformers_cache = args.transformers_cache + os.environ["TRANSFORMERS_CACHE"] = transformers_cache + model_name = args.model_name + model_type = args.model_type + batch_size = args.batch_size + chunk_size = args.chunk_size + input_file = args.input_file + output_file = os.path.splitext(args.output_file)[0] # Remove file extension from name + adapter_name = args.adapter_name + devices = args.gpus.split(",") + + ctx = mp.get_context('spawn') + + if model_type == "sentence-transformer": + model = SentenceTransformer(model_name, batch_size, True) + elif model_type == "transformer": + model = Transformer(model_name, batch_size, True) + elif model_type == "adapter": + model = AdapterTransformer(model_name, batch_size, True, transformers_cache, adapter_name) + + if os.path.dirname(output_file): + os.makedirs(os.path.dirname(output_file), exist_ok=True) + + input_queue = ctx.Queue(maxsize=2*len(devices)) + output_queue = ctx.Queue() + + read_p = ctx.Process(target=_read_process, 
args=(input_file, batch_size, input_queue), daemon=True) + read_p.start() + write_p = ctx.Process(target=_write_process, args=(output_file, chunk_size, args, output_queue), daemon=True) + write_p.start() + for cuda_id in devices: + p = ctx.Process(target=_encode_process, args=(model, cuda_id, args.float16, input_queue, output_queue), daemon=True) + p.start() + + write_p.join() + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--transformers_cache", help="Cache folder where model files will be downloaded into and loaded from") @@ -291,14 +429,25 @@ def encode(args): "Each output file contains chunk_size embeddings " "(except the last one if len(input) mod chunk_size != 0)") parser.add_argument("--input_file", help="Input .jsonl file. Each line is a dict object: {'id': 'xxx', 'text': 'abc...'}") - parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." + parser.add_argument("--output_file", help="Output .pkl/.h5 file. A chunk index will be inserted between the name and extension: e.g. 'path/to/name_chunkidx.pkl' ." "Format: {'ids': List[str], 'embeddings': ndarray}. " "Note for hdf5, use f['ids'].asstr() to load ids as string because default is binary.") parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") - parser.add_argument("--hdf5", action="store_true") + parser.add_argument("--hdf5", action="store_true", help="Save output with hdf5 instead of pickle") parser.add_argument("--hdf5-gzip-level", type=int, default=4, help="GZIP compression level for HDF5 in range 0-9, default 4 (bigger is more compressed). " - "Set to negative value to disable compression") - parser.add_argument("--float16", action="store_true") + "Set to negative value to disable compression." + "Only used when --hdf5 is also set.") + parser.add_argument("--float16", action="store_true", help="Save embeddings as float16") + parser.add_argument("--gpus", help="Set this value to use multiprocessing." + "Comma-separated list of devices (e.g., cuda:0,cuda:1) for multi-GPU processing." + "Reading, writing and each GPU is assigned its own process." 
+ "Can also be used with only one device to use the multiprocessing for reading/ writing of outputs but this is not necessarily faster with one GPU.") args = parser.parse_args() - encode(args) + start_time = time.time() + if not args.gpus: + encode(args) + else: + encode_multiprocess(args) + end_time = time.time() + logger.info(f"Finished encoding in {end_time-start_time}s") From bd6fdba2523431b4b2c6fe48fac2aa3049e21896 Mon Sep 17 00:00:00 2001 From: Geigle Date: Sat, 7 Aug 2021 16:15:11 +0200 Subject: [PATCH 21/23] Replaced loguru with standard Python logging (problems with using the logging.conf); Fixed missing updates for Auth Server unit tests; added Locust for load testing --- .github/workflows/ci.yml | 2 +- square-model-inference-api/Makefile | 1 + square-model-inference-api/README.md | 10 ++- .../auth_server/auth_api/security.py | 6 +- .../auth_server/main.py | 3 +- .../example_docker-compose.yml | 1 + .../inference_server/main.py | 8 +-- .../inference_server/requirements1.txt | 1 - .../api/routes/prediction.py | 5 +- .../core/event_handlers.py | 5 +- .../inference/adaptertransformer.py | 5 +- .../inference/sentencetransformer.py | 8 +-- .../inference/transformer.py | 5 +- .../pre_test_setup_for_docker_caching.py | 6 +- square-model-inference-api/locust/README.md | 52 +++++++++++++++ square-model-inference-api/locust/config.json | 41 ++++++++++++ .../locust/locustfile.py | 63 +++++++++++++++++++ .../locust/requirements.txt | 1 + square-model-inference-api/nginx/nginx.conf | 2 +- 19 files changed, 200 insertions(+), 25 deletions(-) create mode 100644 square-model-inference-api/locust/README.md create mode 100644 square-model-inference-api/locust/config.json create mode 100644 square-model-inference-api/locust/locustfile.py create mode 100644 square-model-inference-api/locust/requirements.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7bb1ad9c8..e4bd36c1d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -320,7 +320,7 @@ jobs: with: builder: ${{ steps.buildx.outputs.name }} context: ./square-model-inference-api/auth_server - #target: build # Not multistage + target: build push: ${{github.ref == 'refs/heads/master'}} tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}, ${{ steps.prep.outputs.image }}:latest cache-from: type=local,src=/tmp/.buildx-cache diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 0e8bc2e54..39899afde 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -8,6 +8,7 @@ all: clean test install run deploy down test: python -m pip install --upgrade pip && pip install pytest pytest-cov pytest-asyncio + PYTHONPATH=auth_server/ pytest --cov PYTHONPATH=inference_server/ pytest --cov install: diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 26dcb90a1..8b245517f 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -12,9 +12,9 @@ to handle authorization of requests with the auth server. ├───auth_server # FastAPI Authorization Server │ ├───main.py # Entry point in server │ ├───Dockerfile # Dockerfile for server +│ ├───tests # Unit Tests +│ │ ├───test_api │ └───auth_api -├───nginx # nginx config for API Gateway & Authorizatio -│ └───nginx.conf ├───inference_server # FastAPI Model API Server │ ├───tests # Unit Tests │ │ ├───test_api @@ -27,6 +27,9 @@ to handle authorization of requests with the auth server. 
│ ├───core # Server config, Startup logic, etc. │ ├───models # Input/ output modelling for API │ └───inference # Deep Model implementation and inference code for NLP tasks +├───nginx # nginx config for API Gateway & Authorizatio +│ └───nginx.conf +├───locust # Load testing configuration with Locust └───example_docker-compose.yml # Example docker-compose setup for the Model API ``` @@ -73,10 +76,11 @@ make deploy ``` #### Running Tests - +For unit tests: ```sh make test ``` +For load testing with Locust, see [this README](locust/README.md). ## Setup 1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py index 058086f88..9fd328025 100644 --- a/square-model-inference-api/auth_server/auth_api/security.py +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -3,10 +3,14 @@ from fastapi import HTTPException, Security from fastapi.security.api_key import APIKeyHeader from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED -from loguru import logger from .config import API_KEY, API_KEY_HEADER_NAME from .messages import AUTH_REQ, NO_API_KEY + +import logging + +logger = logging.getLogger(__name__) + api_key_header = APIKeyHeader(name=API_KEY_HEADER_NAME, auto_error=False) diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index fb323930d..6daf91883 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -1,8 +1,9 @@ from fastapi import FastAPI, Depends import auth_api.security as security from logging.config import fileConfig -from loguru import logger +import logging +logger = logging.getLogger(__name__) try: fileConfig("logging.conf") except: diff --git a/square-model-inference-api/example_docker-compose.yml b/square-model-inference-api/example_docker-compose.yml index e03421974..dbfa7e022 100644 --- a/square-model-inference-api/example_docker-compose.yml +++ b/square-model-inference-api/example_docker-compose.yml @@ -4,6 +4,7 @@ services: auth: #build: ./auth_server/. image: ukpsquare/square-model-api-auth:latest + container_name: square_model_auth ports: - 8081:8081 env_file: diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index a3335add8..3f06b41de 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -1,14 +1,14 @@ -import sys - from fastapi import FastAPI from square_model_inference.api.routes.router import api_router from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler from logging.config import fileConfig -from loguru import logger +import logging + +logger = logging.getLogger(__name__) def get_app() -> FastAPI: - # Set logging config. Loguru uses this then, too + # Set logging config. 
try: fileConfig("logging.conf") except: diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt index 794d45f82..6db7ba44c 100644 --- a/square-model-inference-api/inference_server/requirements1.txt +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -1,7 +1,6 @@ uvicorn==0.13.4 # ASGI server fastapi==0.65.1 # REST API Framework pydantic==1.8.2 # Input/ output modelling -loguru==0.5.3 # Easier logging python-dotenv==0.17.1 # Required for .env configs sentencepiece==0.1.95 torch==1.8.1 diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index eae7bda22..1de004a0d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -1,11 +1,14 @@ from fastapi import APIRouter from starlette.requests import Request -from loguru import logger from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding +import logging + +logger = logging.getLogger(__name__) + router = APIRouter() diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 6011b30ec..2a664f050 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -1,7 +1,6 @@ from typing import Callable from fastapi import FastAPI -from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, \ @@ -9,6 +8,10 @@ from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer +import logging + +logger = logging.getLogger(__name__) + MODEL_MAPPING = { "adapter": AdapterTransformer, "transformer": Transformer, diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 99911bef2..2ad247089 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -1,5 +1,3 @@ -from loguru import logger - from transformers import AutoModelWithHeads, list_adapters from square_model_inference.inference.transformer import Transformer @@ -7,6 +5,9 @@ from square_model_inference.models.prediction import PredictionOutput +import logging + +logger = logging.getLogger(__name__) class AdapterTransformer(Transformer): """ diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py 
b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index 7523bab29..b07de3bd0 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -1,9 +1,4 @@ -import json -from typing import Union, Tuple - import torch -from loguru import logger -import numpy as np from sentence_transformers import SentenceTransformer as SentenceTransformerModel from square_model_inference.inference.model import Model @@ -11,6 +6,9 @@ from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForEmbedding +import logging + +logger = logging.getLogger(__name__) class SentenceTransformer(Model): """ diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index c7657de47..65dcceb0a 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -3,7 +3,6 @@ from typing import Union, Tuple import torch -from loguru import logger import numpy as np from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \ AutoModelForTokenClassification, AutoModelForQuestionAnswering, AutoModelForCausalLM @@ -14,6 +13,10 @@ from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding +import logging + +logger = logging.getLogger(__name__) + CLASS_MAPPING = { "base": AutoModel, "sequence_classification": AutoModelForSequenceClassification, diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index 12c6de162..8460db236 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -1,12 +1,12 @@ # This file is used for the Docker image to cache long-running setup for the tests, # i.e., downloading finetuned Transformer models and so on. # This way, adding new tests or even changing the server code does NOT trigger a new download during building -import json - from sentence_transformers import SentenceTransformer from starlette.config import environ from transformers import AutoTokenizer, AutoModelWithHeads, list_adapters -from loguru import logger +import logging + +logger = logging.getLogger(__name__) TRANSFORMERS_TESTING_CACHE = "./.model_testing_cache" environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE diff --git a/square-model-inference-api/locust/README.md b/square-model-inference-api/locust/README.md new file mode 100644 index 000000000..261f52f94 --- /dev/null +++ b/square-model-inference-api/locust/README.md @@ -0,0 +1,52 @@ +# Locust +We use Locust for load testing. +See their [documentation](https://docs.locust.io/en/stable/) for more infos. + +## Setup +1. Install Python (>3.6) and Locust ([requirements.txt](requirements.txt)) on your system + (does not have to be the same system as where the Model API runs). +2. 
Write your ``config.json`` (see below for more) +3. Start the Model API servers (with Docker or locally or however you want). This does *not* have to be +on the same computer/ server as Locust. +4. Start the Locust server in this directory (``locust -f locustfile.py``), visit the web UI and start +the load test. For alternative methods see the above documentation. + +## config.json +We describe the keys with expected values for the config.json: +```json +{ + "config": { + # Time each user waits (min, max) after a request. Default [1, 2] + "wait_time": [1, 2], + # API key for authorization + "api_key": "example_key" + # Header for API key. Default: Authorization + "api_key_header": "Authorization" + }, + # List of Locust tasks. A user randomly selects from this list each time and starts the request + "tasks": [ + { + # Request to /api/$model/$endpoint + "endpoint": "embedding", + "model": "bert-base-uncased", + # Set to integer greater 1 to increase chance that this task is chosen. + # This task appears $weight-times in the list of tasks + # (from which the next task is uniformly chosen) + "weight": 1, + # The JSON of the query that is send to the Model API. + # See the documentation of the API for more details on this. + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "" + } + } + ] +} +``` diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json new file mode 100644 index 000000000..4e86ebaf6 --- /dev/null +++ b/square-model-inference-api/locust/config.json @@ -0,0 +1,41 @@ +{ + "config": { + + "wait_time": [0, 0.5], + "api_key": "example_key" + }, + "tasks": [ + { + "endpoint": "embedding", + "model": "bert-base-uncased", + "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + }, + { + "endpoint": "sequence-classification", + "model": "bert-base-uncased", + "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + } + ] +} \ No newline at end of file diff --git a/square-model-inference-api/locust/locustfile.py b/square-model-inference-api/locust/locustfile.py new file mode 100644 index 000000000..a8e4514b4 --- /dev/null +++ b/square-model-inference-api/locust/locustfile.py @@ -0,0 +1,63 @@ +import json + +from locust import between +from locust.contrib.fasthttp import FastHttpUser + + +def curry_config_in_task(callable, config, endpoint): + """ + Identical task function might be called with different configs (different model etc.) but Locust calls tasks only + with the user as argument. We curry the task function with the given config into a Locust-callable task function. + :param callable: the task function to use + :param config: the specific config to be used for the task function + :param endpoint: the model API task endpoint that is called + :return: a Locust task function with signature f(user) that calls callable(user, config, model_api_task) + """ + def task(user): + return callable(user, config, endpoint) + return task + + +def task_query(config, endpoint): + """ + Template to make Locust tasks for queries that are generated dynamically based on the given config and endpoint. 
+ Locust calls its task functions only with the user as argument so we create a closure and return it. + :param config: the config for the task with the model name, the API-Key, the JSON input, etc. + :param endpoint: the endpoint for the query + :return: the closure for the Locust task function that makes the specified query + """ + def query(user): + path = f"/api/{config['model']}/{endpoint}" + query_json = config["query_json"] + headers = {config.get("api_key_header", "Authorization"): config["api_key"]} + user.client.post(path, json=query_json, headers=headers) + return query + + +class ModelAPIUser(FastHttpUser): + wait_time = between(1, 2) + tasks = [] + + def __init__(self, *args, **kwargs): + # Load config + config = json.load(open("config.json")) + general_config = config["config"] + + # Setup User + wait_time = general_config.get("wait_time", [1, 2]) + # self.wait_time = between(...) does not work for some reason because it expects one argument that is not used but not supplied in calls + self.wait_time = lambda: between(wait_time[0], wait_time[1])(None) + + # Setup the Locust tasks + tasks = [] + for task in config["tasks"]: + task.update(general_config) + # Endpoint in URL uses - instead of _, so we replace it in case config was wrong + task_function = task_query(task, task["endpoint"].replace("_", "-")) + for _ in range(task.get("weight", 1)): + tasks.append(task_function) + self.tasks = tasks + + super().__init__(*args, **kwargs) + + diff --git a/square-model-inference-api/locust/requirements.txt b/square-model-inference-api/locust/requirements.txt new file mode 100644 index 000000000..61ddec44d --- /dev/null +++ b/square-model-inference-api/locust/requirements.txt @@ -0,0 +1 @@ +locust==2.0.0 \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 1858be5bf..54fbc7187 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -39,7 +39,7 @@ http { location /auth { internal; - proxy_pass http://auth:8081/auth; + proxy_pass http://square_model_auth:8081/auth; } } } \ No newline at end of file From 4a577c361cdc10d17c3bddf791ce77d342d58ab3 Mon Sep 17 00:00:00 2001 From: Geigle Date: Sat, 7 Aug 2021 16:20:53 +0200 Subject: [PATCH 22/23] Removed unneeded function --- square-model-inference-api/locust/config.json | 21 ++----------------- .../locust/locustfile.py | 14 ------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json index 4e86ebaf6..1b7dd94ba 100644 --- a/square-model-inference-api/locust/config.json +++ b/square-model-inference-api/locust/config.json @@ -1,7 +1,6 @@ { "config": { - - "wait_time": [0, 0.5], + "wait_time": [1, 2], "api_key": "example_key" }, "tasks": [ @@ -18,23 +17,7 @@ "preprocessing_kwargs": { }, "model_kwargs": { }, "task_kwargs": { }, - "adapter_name": "nli/rte@ukp" - } - }, - { - "endpoint": "sequence-classification", - "model": "bert-base-uncased", - "weight": 1, - "query_json": { - "input": - [ - "test input" - ], - "is_preprocessed": false, - "preprocessing_kwargs": { }, - "model_kwargs": { }, - "task_kwargs": { }, - "adapter_name": "nli/rte@ukp" + "adapter_name": "" } } ] diff --git a/square-model-inference-api/locust/locustfile.py b/square-model-inference-api/locust/locustfile.py index a8e4514b4..7738f3278 100644 --- a/square-model-inference-api/locust/locustfile.py +++ 
b/square-model-inference-api/locust/locustfile.py @@ -4,20 +4,6 @@ from locust.contrib.fasthttp import FastHttpUser -def curry_config_in_task(callable, config, endpoint): - """ - Identical task function might be called with different configs (different model etc.) but Locust calls tasks only - with the user as argument. We curry the task function with the given config into a Locust-callable task function. - :param callable: the task function to use - :param config: the specific config to be used for the task function - :param endpoint: the model API task endpoint that is called - :return: a Locust task function with signature f(user) that calls callable(user, config, model_api_task) - """ - def task(user): - return callable(user, config, endpoint) - return task - - def task_query(config, endpoint): """ Template to make Locust tasks for queries that are generated dynamically based on the given config and endpoint. From d3c047ecb48a93fcc8db93ea209ba702a677ed89 Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 10 Aug 2021 15:33:03 +0200 Subject: [PATCH 23/23] Finalized configs for CICD; fixed logging; updated documentation; embedding:added pooler output as option --- docker-compose.yaml | 38 +++++++++++++++++++ square-model-inference-api/README.md | 32 +++++++++++----- .../auth_server/main.py | 2 +- .../inference_server/.env.bert_adapter | 23 +++++++++++ .../inference_server/.env.dpr | 26 +++++++++++++ .../inference_server/main.py | 2 +- .../inference/transformer.py | 4 +- .../models/prediction.py | 2 +- .../square_model_inference/models/request.py | 3 +- .../tests/test_inference/test_transformers.py | 4 +- square-model-inference-api/locust/config.json | 18 ++++++++- square-model-inference-api/nginx/nginx.conf | 21 ++++++---- 12 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 square-model-inference-api/inference_server/.env.bert_adapter create mode 100644 square-model-inference-api/inference_server/.env.dpr diff --git a/docker-compose.yaml b/docker-compose.yaml index c093325f6..8d4d17ac9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -29,6 +29,44 @@ services: ports: - 80:80 +### MODEL API + model_auth: + image: ukpsquare/square-model-api-auth:latest + container_name: square_model_auth + ports: + - 8081:8081 + env_file: + - ./square-model-inference-api/auth_server/.env + + model_nginx: + image: nginx + ports: + - 8080:8080 + volumes: + - ./square-model-inference-api/nginx/nginx.conf:/etc/nginx/nginx.conf:ro + + inference_bert_adapter: + image: ukpsquare/square-model-api:latest + ports: + - 8000:8000 + env_file: + - ./square-model-inference-api/inference_server/.env.bert_adapter + container_name: square_model_inference_bert_adapter + volumes: + - ./.cache/:/etc/huggingface/.cache/ + + inference_dpr: + image: ukpsquare/square-model-api:latest + ports: + - 8001:8000 + env_file: + - ./square-model-inference-api/inference_server/.env.dpr + container_name: square_model_inference_dpr + volumes: + - ./.cache/:/etc/huggingface/.cache/ + +### / MODEL API Finished + #adminer: # image: adminer # restart: always diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 8b245517f..31eabd254 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -2,6 +2,15 @@ Inference API that supports SOTA (QA) models & adapters. Receives input and returns prediction and other artifacts (e.g. 
attention scores) +## On the API Path +The 'true' path of the API for the model server is of the form `/api/$endpoint` where the endpoint +is embeddings, question-answering, etc. This is the path you use if you just run a model server locally. + +However, to run and distinguish multiple models, we use an API gateway with nginx so we extend +the path to `/api/$modelname/$endpoint` which is then resolved by nginx to the correct model server and forwarded +to this server's `/api/$endpoint` endpoint. This is the path you use with Docker. +This requires you to setup the docker-compose and nginx config as described below. + ## Project structure The Model API uses 3 components: @@ -58,6 +67,20 @@ Both `transformers` and `adapter-transformers` use the same namespace so they co Thus, we first install `sentence-transformers` along with `transformers`, uninstall `transformers`, and finally install `adapter-transformers`. + +## Setup +### Docker +1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. +2. For each model server that should run, create a `.env.$model` to configure it. + See [here](inference_server/.env.example) for an example. +3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to + match `container_name` of each server in the `docker-compose.yaml`. +4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the + model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example. +### Local +Create `inference_server/.env` and configure it as needed for your local model server. +You do not need nginx and the authorization server for local testing. + ## Running #### Running Localhost @@ -81,12 +104,3 @@ For unit tests: make test ``` For load testing with Locust, see [this README](locust/README.md). - -## Setup -1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. -2. For each model server that should run, create a `.env.$model` to configure it. - See [here](inference_server/.env.example) for an example. -3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to - match `container_name` of each server in the `docker-compose.yaml`. -4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the -model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example. \ No newline at end of file diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index 6daf91883..b091250e9 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) try: - fileConfig("logging.conf") + fileConfig("logging.conf", disable_existing_loggers=False) except: logger.info("Failed to load 'logging.conf'. 
Continuing without configuring the server logger") app = FastAPI() diff --git a/square-model-inference-api/inference_server/.env.bert_adapter b/square-model-inference-api/inference_server/.env.bert_adapter new file mode 100644 index 000000000..6453fb42e --- /dev/null +++ b/square-model-inference-api/inference_server/.env.bert_adapter @@ -0,0 +1,23 @@ +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers +MODEL_NAME=bert-base-uncased +# Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model +MODEL_TYPE=adapter + +# Disable CUDA even if available +DISABLE_GPU=True +# Batch size used for many inputs +BATCH_SIZE=32 +# Inputs larger than this size are rejected +MAX_INPUT_SIZE=1024 + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE=/etc/huggingface/.cache/ + + +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). +# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS=False \ No newline at end of file diff --git a/square-model-inference-api/inference_server/.env.dpr b/square-model-inference-api/inference_server/.env.dpr new file mode 100644 index 000000000..a1e3784a4 --- /dev/null +++ b/square-model-inference-api/inference_server/.env.dpr @@ -0,0 +1,26 @@ +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers +MODEL_NAME=facebook/dpr-question_encoder-single-nq-base +# Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model +MODEL_TYPE=transformer + +# Disable CUDA even if available +DISABLE_GPU=False +# Batch size used for many inputs +BATCH_SIZE=32 +# Inputs larger than this size are rejected +MAX_INPUT_SIZE=1024 + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE=/etc/huggingface/.cache/ + +# For MODEL_TYPE=transformers: decides the AutoModel* class used +# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class +MODEL_CLASS=base + +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). +# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS=False \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index 3f06b41de..849fa23a2 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -10,7 +10,7 @@ def get_app() -> FastAPI: # Set logging config. try: - fileConfig("logging.conf") + fileConfig("logging.conf", disable_existing_loggers=False) except: logger.info("Failed to load 'logging.conf'. 
Continuing without configuring the server logger") fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 65dcceb0a..5b013d8a1 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -29,7 +29,7 @@ class Transformer(Model): """ The class for all Huggingface transformer-based models """ - SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token", "pooler"] def __init__(self, model_name, model_class, batch_size, disable_gpu, max_input_size, **kwargs): """ @@ -127,6 +127,8 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: if embedding_mode == "cls": emb = hidden_state[:, 0, :] + elif embedding_mode == "pooler": + emb = predictions["pooler_output"] # copied from sentence-transformers pooling elif embedding_mode == "max": input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index a3f1d7fbb..f6a10712f 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -124,7 +124,7 @@ def __init__(self, **data): class PredictionOutputForEmbedding(PredictionOutput): - embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.
One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')") + embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.
One of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token')") word_ids: List[List[Optional[int]]] = Field([], description="Only used by Transformers/ Adapters.
" "Only set with embedding_mode='token'." " Mapping from each token to the corresponding word in the input. " diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index 547de9191..89678d9a6 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -62,7 +62,8 @@ class PredictionRequest(BaseModel): "'token_classification':
" "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token'). " + "'pooler' uses the pooler_output of a Transformer, i.e. the processed CLS token. Default value 'mean'.
" "'question_answering':
" "- 'topk': Return the top-k most likely spans. Default 1.
" "- 'max_answer_len': Maximal token length of answers. Default 128.
" diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index f389e9d00..c715fd2f1 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -82,7 +82,9 @@ class TestTransformerEmbedding: (["this is a test"], "max"), (["this is a test", "this is a test with a longer sentence"], "max"), (["this is a test"], "cls"), - (["this is a test", "this is a test with a longer sentence"], "cls")], + (["this is a test", "this is a test with a longer sentence"], "cls"), + (["this is a test"], "pooler"), + (["this is a test", "this is a test with a longer sentence"], "pooler")], ) async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): prediction_request.input = input diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json index 1b7dd94ba..94fc2f27b 100644 --- a/square-model-inference-api/locust/config.json +++ b/square-model-inference-api/locust/config.json @@ -5,9 +5,25 @@ }, "tasks": [ { - "endpoint": "embedding", + "endpoint": "sequence-classification", "model": "bert-base-uncased", "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + }, + { + "endpoint": "embedding", + "model": "facebook/dpr-question_encoder-single-nq-base", + "weight": 1, "query_json": { "input": [ diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 54fbc7187..56a4d7e53 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -22,21 +22,26 @@ http { access_log /var/log/nginx/access.log json_combined; listen 8080; - - location / { - proxy_pass http://square_model_inference_bert:8000/; + # Model API Documentation + location /docs { + proxy_pass http://square_model_inference_bert_adapter:8000/docs; + } + location /redoc { + proxy_pass http://square_model_inference_bert_adapter:8000/redoc; } + # Model Server API Gateway location /api/bert-base-uncased { auth_request /auth; - proxy_pass http://square_model_inference_bert:8000/api; + proxy_pass http://square_model_inference_bert_adapter:8000/api; } -# location /api/roberta-base { -# auth_request /auth; -# proxy_pass http://square_model_inference_roberta:8000/api; -# } + location /api/facebook/dpr-question_encoder-single-nq-base { + auth_request /auth; + proxy_pass http://square_model_inference_dpr:8000/api; + } + # Auth Server location /auth { internal; proxy_pass http://square_model_auth:8081/auth;