From e35a9cb31ae4242c22a3ffac0e95c37b40be1267 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 3 Jun 2021 17:34:37 +0200 Subject: [PATCH 01/23] Initial commit: scaffolding, already implemented a lot for transformers/ adapters --- square-model-inference-api/.gitignore | 133 +++++++++++++++ square-model-inference-api/Makefile | 42 +++++ square-model-inference-api/README.md | 80 +++++++++ .../auth_server/.env.example | 2 + .../auth_server/Dockerfile | 14 ++ .../auth_server/auth_api/__init__.py | 0 .../auth_server/auth_api/config.py | 7 + .../auth_server/auth_api/messages.py | 4 + .../auth_server/auth_api/security.py | 25 +++ .../auth_server/main.py | 8 + .../auth_server/requirements.txt | 3 + square-model-inference-api/docker-compose.yml | 40 +++++ .../inference_server/.dockerignore | 133 +++++++++++++++ .../inference_server/Dockerfile | 14 ++ .../inference_server/main.py | 21 +++ .../inference_server/requirements.txt | 8 + .../square_model_inference/__init__.py | 0 .../square_model_inference/api/__init__.py | 0 .../api/routes/__init__.py | 0 .../api/routes/heartbeat.py | 11 ++ .../api/routes/prediction.py | 20 +++ .../api/routes/router.py | 7 + .../square_model_inference/core/__init__.py | 0 .../square_model_inference/core/config.py | 18 ++ .../core/event_handlers.py | 40 +++++ .../square_model_inference/core/messages.py | 1 + .../inference/__init__.py | 0 .../inference/adaptertransformer.py | 86 ++++++++++ .../square_model_inference/inference/model.py | 18 ++ .../inference/transformer.py | 155 ++++++++++++++++++ .../square_model_inference/models/__init__.py | 0 .../models/heartbeat.py | 5 + .../models/prediction.py | 48 ++++++ .../square_model_inference/models/request.py | 54 ++++++ .../inference_server/tox.ini | 66 ++++++++ square-model-inference-api/nginx/nginx.conf | 34 ++++ square-model-inference-api/tests/__init__.py | 0 square-model-inference-api/tests/conftest.py | 16 ++ .../tests/test_api/__init__.py | 0 .../tests/test_api/test_api_auth.py | 17 ++ .../tests/test_api/test_heartbeat.py | 14 ++ .../tests/test_api/test_prediction.py | 23 +++ .../tests/test_service/__init__.py | 0 .../tests/test_service/test_models.py | 27 +++ 44 files changed, 1194 insertions(+) create mode 100644 square-model-inference-api/.gitignore create mode 100644 square-model-inference-api/Makefile create mode 100644 square-model-inference-api/README.md create mode 100644 square-model-inference-api/auth_server/.env.example create mode 100644 square-model-inference-api/auth_server/Dockerfile create mode 100644 square-model-inference-api/auth_server/auth_api/__init__.py create mode 100644 square-model-inference-api/auth_server/auth_api/config.py create mode 100644 square-model-inference-api/auth_server/auth_api/messages.py create mode 100644 square-model-inference-api/auth_server/auth_api/security.py create mode 100644 square-model-inference-api/auth_server/main.py create mode 100644 square-model-inference-api/auth_server/requirements.txt create mode 100644 square-model-inference-api/docker-compose.yml create mode 100644 square-model-inference-api/inference_server/.dockerignore create mode 100644 square-model-inference-api/inference_server/Dockerfile create mode 100644 square-model-inference-api/inference_server/main.py create mode 100644 square-model-inference-api/inference_server/requirements.txt create mode 100644 square-model-inference-api/inference_server/square_model_inference/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/__init__.py create mode 100644 
square-model-inference-api/inference_server/square_model_inference/api/routes/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/api/routes/router.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/config.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/core/messages.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/model.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/transformer.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/__init__.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/prediction.py create mode 100644 square-model-inference-api/inference_server/square_model_inference/models/request.py create mode 100644 square-model-inference-api/inference_server/tox.ini create mode 100644 square-model-inference-api/nginx/nginx.conf create mode 100644 square-model-inference-api/tests/__init__.py create mode 100644 square-model-inference-api/tests/conftest.py create mode 100644 square-model-inference-api/tests/test_api/__init__.py create mode 100644 square-model-inference-api/tests/test_api/test_api_auth.py create mode 100644 square-model-inference-api/tests/test_api/test_heartbeat.py create mode 100644 square-model-inference-api/tests/test_api/test_prediction.py create mode 100644 square-model-inference-api/tests/test_service/__init__.py create mode 100644 square-model-inference-api/tests/test_service/test_models.py diff --git a/square-model-inference-api/.gitignore b/square-model-inference-api/.gitignore new file mode 100644 index 000000000..7389010d8 --- /dev/null +++ b/square-model-inference-api/.gitignore @@ -0,0 +1,133 @@ +.idea/ + +/ml_model/models + +# Hidden files +.DS_store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# data is not logged +data/ + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +.vscode/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +htmlcov-py36/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +.vscode/ +.pytest-cache/ +.pytest_cache/ +.empty/ + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.idea/ + + + +Child_Watch_API/ +htmlcov-*/ + +# remove large model +ml_model/*.joblib + + +#venv +venv/ + + diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile new file mode 100644 index 000000000..e49710103 --- /dev/null +++ b/square-model-inference-api/Makefile @@ -0,0 +1,42 @@ +SHELL := /bin/bash + +# Target section and Global definitions +# ----------------------------------------------------------------------------- +.PHONY: all clean test install run deploy down + +all: clean test install run deploy down + +test: + python -m pip install --upgrade pip && pip install setuptools tox + tox + +install: + pip install --upgrade pip && pip install -r inference_server/requirements.txt + +run: + PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env.debug + +build: + docker-compose build + +deploy: + docker-compose build + docker-compose up -d + +down: + docker-compose down + +clean: + -find . -name '*.pyc' -exec rm -rf {} \; + -find . -name '__pycache__' -exec rm -rf {} \; + -find . -name 'Thumbs.db' -exec rm -rf {} \; + -find . -name '*~' -exec rm -rf {} \; + -rm -rf .cache + -rm -rf build + -rm -rf dist + -rm -rf *.egg-info + -rm -rf htmlcov* + -rm -rf .tox/ + -rm -rf docs/_build + -rm -r .coverage + -rm -rf .pytest_cache \ No newline at end of file diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md new file mode 100644 index 000000000..5b93ceaad --- /dev/null +++ b/square-model-inference-api/README.md @@ -0,0 +1,80 @@ +# SQuARE Model API +Inference API that supports SOTA (QA) models & adapters. +Receives input and returns prediction and other artifacts (e.g. attention scores) + +## Project structure + +TODO +``` +├───tests +│ ├───test_api +│ └───test_service +├───auth_server +│ └───auth_api +├───nginx +├───inference_server +│ ├───square_model_inference +│ │ ├───api - Model API +│ │ │ ├───routes +│ │ ├───core +│ │ ├───models - Pydantic model definitions for in-/ output +│ │ ├───inference +``` + +## Requirements + +Python 3.7+, Docker (optional), Make (optional) + +## Installation +Install the required packages in your local environment (ideally virtualenv, conda, etc.). + + +```sh +python -m venv venv +source venv/bin/activate +make install +``` + +#### Running Localhost + +```sh +make run +``` + +#### Running Via Docker + +```sh +make deploy +``` + +#### Running Tests + +```sh +make test +``` + +## Setup +TODO + +## Run without `make` for development + +1. Start your app with: +```bash +PYTHONPATH=./inference_server uvicorn main:app --reload +``` + +2. 
Go to [http://localhost:8000/docs](http://localhost:8000/docs) for the Swagger UI or [http://localhost:8000/redoc](http://localhost:8000/redoc) for the alternative ReDoc documentation + + +## Run Tests without using `make` + +Install testing libraries and run `tox`: +```bash +pip install tox pytest flake8 coverage bandit +tox +``` +This runs the tests with coverage under Python 3.8 as well as the Flake8 and Bandit checks. + + diff --git a/square-model-inference-api/auth_server/.env.example b/square-model-inference-api/auth_server/.env.example new file mode 100644 index 000000000..6125fa20f --- /dev/null +++ b/square-model-inference-api/auth_server/.env.example @@ -0,0 +1,2 @@ +API_KEY=example_key +API_KEY_NAME=api_key \ No newline at end of file diff --git a/square-model-inference-api/auth_server/Dockerfile b/square-model-inference-api/auth_server/Dockerfile new file mode 100644 index 000000000..0ced2f573 --- /dev/null +++ b/square-model-inference-api/auth_server/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.8.2 + +ENV PYTHONUNBUFFERED 1 + +EXPOSE 8081 +WORKDIR /app + +COPY requirements.txt ./ +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +COPY . ./ + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081"] \ No newline at end of file diff --git a/square-model-inference-api/auth_server/auth_api/__init__.py b/square-model-inference-api/auth_server/auth_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/auth_server/auth_api/config.py b/square-model-inference-api/auth_server/auth_api/config.py new file mode 100644 index 000000000..ebd10abc4 --- /dev/null +++ b/square-model-inference-api/auth_server/auth_api/config.py @@ -0,0 +1,7 @@ +from starlette.config import Config +from starlette.datastructures import Secret + +config = Config(".env") + +API_KEY: Secret = config("API_KEY", cast=Secret) +API_KEY_NAME: str = config("API_KEY_NAME", cast=str, default="api_key") diff --git a/square-model-inference-api/auth_server/auth_api/messages.py b/square-model-inference-api/auth_server/auth_api/messages.py new file mode 100644 index 000000000..85c28dc04 --- /dev/null +++ b/square-model-inference-api/auth_server/auth_api/messages.py @@ -0,0 +1,4 @@ +NO_API_KEY = "No API key provided." +AUTH_REQ = "Authentication required." +HTTP_500_DETAIL = "Internal server error."
+ diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py new file mode 100644 index 000000000..8f6082a8c --- /dev/null +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -0,0 +1,25 @@ +import secrets + +from fastapi import HTTPException, Security +from fastapi.security.api_key import APIKeyHeader, APIKeyQuery +from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED + +from .config import API_KEY, API_KEY_NAME +from .messages import AUTH_REQ, NO_API_KEY + +api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False) +api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) + + +def validate_request(query: str = Security(api_key_query), + header: str = Security(api_key_header),) -> bool: + if query is None and header is None: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY, headers={} + ) + elif not secrets.compare_digest(str(query), str(API_KEY)) \ + and not secrets.compare_digest(str(header), str(API_KEY)): + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, detail=AUTH_REQ, headers={} + ) + return True diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py new file mode 100644 index 000000000..90332c6d2 --- /dev/null +++ b/square-model-inference-api/auth_server/main.py @@ -0,0 +1,8 @@ +from fastapi import FastAPI, Depends +import auth_api.security as security + +app = FastAPI() + +@app.get("/auth") +async def auth(authenticated: bool = Depends(security.validate_request),): + return {"authenticated": True} diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt new file mode 100644 index 000000000..e059f7f5f --- /dev/null +++ b/square-model-inference-api/auth_server/requirements.txt @@ -0,0 +1,3 @@ +uvicorn +fastapi +pydantic diff --git a/square-model-inference-api/docker-compose.yml b/square-model-inference-api/docker-compose.yml new file mode 100644 index 000000000..ada141ee6 --- /dev/null +++ b/square-model-inference-api/docker-compose.yml @@ -0,0 +1,40 @@ +version: "3" + +services: + inference_bert: + build: ./inference_server/. + image: square_model_inference:latest + ports: + - 8000:8000 + env_file: + - ./inference_server/.env.bert + container_name: square_model_inference_bert + volumes: + - ./.cache/:/etc/huggingface/.cache/ + - ./inference_server/:/app/ #for development + + #inference_roberta: + # build: ./inference_server/. + # image: square_model_inference:latest + # ports: + # - 8001:8000 + # env_file: + # - ./inference_server/.env.roberta + # container_name: square_model_inference_roberta + # volumes: + # - ./.cache/:/etc/huggingface/.cache/ + # - ./inference_server/:/app/ #for development + + auth: + build: ./auth_server/.
+ ports: + - 8081:8081 + env_file: + - ./auth_server/.env.example + + nginx: + image: nginx + ports: + - 8080:8080 + volumes: + - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro \ No newline at end of file diff --git a/square-model-inference-api/inference_server/.dockerignore b/square-model-inference-api/inference_server/.dockerignore new file mode 100644 index 000000000..0af8215d8 --- /dev/null +++ b/square-model-inference-api/inference_server/.dockerignore @@ -0,0 +1,133 @@ +/images +/tests +.ascii-art + +*.yml +*.ini + +# Hidden files +.DS_store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# data is not logged +data/ + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +.vscode/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +htmlcov-py36/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +.vscode/ +.pytest-cache/ +.pytest_cache/ +.empty/ + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.idea/ + + + +Child_Watch_API/ +htmlcov-*/ + +# remove large model +ml_model/*.joblib + + +#venv +venv/ + +.idea/ \ No newline at end of file diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile new file mode 100644 index 000000000..4418f182a --- /dev/null +++ b/square-model-inference-api/inference_server/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.8.2 + +ENV PYTHONUNBUFFERED 1 + +EXPOSE 8000 +WORKDIR /app + +COPY requirements.txt ./ +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +COPY . 
./ + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py new file mode 100644 index 000000000..ba3ca507f --- /dev/null +++ b/square-model-inference-api/inference_server/main.py @@ -0,0 +1,21 @@ +from fastapi import FastAPI +from square_model_inference.api.routes.router import api_router +from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION, IS_DEBUG +from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler + + +def get_app() -> FastAPI: + fast_app = FastAPI(title=APP_NAME, version=APP_VERSION, debug=IS_DEBUG) + fast_app.include_router(api_router, prefix=API_PREFIX) + + fast_app.add_event_handler("startup", start_app_handler(fast_app)) + fast_app.add_event_handler("shutdown", stop_app_handler(fast_app)) + + return fast_app + + +app = get_app() + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/square-model-inference-api/inference_server/requirements.txt b/square-model-inference-api/inference_server/requirements.txt new file mode 100644 index 000000000..0b2dd68dc --- /dev/null +++ b/square-model-inference-api/inference_server/requirements.txt @@ -0,0 +1,8 @@ +uvicorn +fastapi +pydantic +loguru +python-dotenv +adapter-transformers +sentencepiece +torch \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/__init__.py b/square-model-inference-api/inference_server/square_model_inference/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/api/__init__.py b/square-model-inference-api/inference_server/square_model_inference/api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/__init__.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py new file mode 100644 index 000000000..702a921bc --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py @@ -0,0 +1,11 @@ +from fastapi import APIRouter + +from square_model_inference.models.heartbeat import HearbeatResult + +router = APIRouter() + + +@router.get("/heartbeat", response_model=HearbeatResult, name="heartbeat") +def get_hearbeat() -> HearbeatResult: + heartbeat = HearbeatResult(is_alive=True) + return heartbeat diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py new file mode 100644 index 000000000..5c2f6ec25 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -0,0 +1,20 @@ +from fastapi import APIRouter +from starlette.requests import Request + +from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.inference.model import Model + +router = APIRouter() + + +@router.post("/predict", 
response_model=PredictionOutput, name="predict") +async def predict( + request: Request, + prediction_request: PredictionRequest = None, +) -> PredictionOutput: + + model: Model = request.app.state.model + prediction: PredictionOutput = await model.predict(prediction_request) + + return prediction diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py new file mode 100644 index 000000000..b1fac0bfd --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py @@ -0,0 +1,7 @@ +from fastapi import APIRouter + +from square_model_inference.api.routes import heartbeat, prediction + +api_router = APIRouter() +api_router.include_router(heartbeat.router, tags=["health"], prefix="/health") +api_router.include_router(prediction.router, tags=["prediction"], prefix="/v1") diff --git a/square-model-inference-api/inference_server/square_model_inference/core/__init__.py b/square-model-inference-api/inference_server/square_model_inference/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py new file mode 100644 index 000000000..d56dd59e2 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -0,0 +1,18 @@ +from starlette.config import Config + +APP_VERSION = "0.1.0" +APP_NAME = "SQuARE Model Inference API" +API_PREFIX = "/api" + +config = Config(".env") + +IS_DEBUG: bool = config("IS_DEBUG", cast=bool, default=False) + +DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) +MODEL_NAME: str = config("MODEL_NAME") +MODEL_TYPE: str = config("MODEL_TYPE") +TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") + +MAX_BATCH_SIZE: int = config("MAX_BATCH_SIZE", cast=int, default=32) + +RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py new file mode 100644 index 000000000..74493aa50 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -0,0 +1,40 @@ +from typing import Callable + +from fastapi import FastAPI +from loguru import logger + +from square_model_inference.inference.adaptertransformer import AdapterTransformer +from square_model_inference.core.config import MODEL_TYPE +from square_model_inference.inference.transformer import Transformer + +MODEL_MAPPING = { + "adapter": AdapterTransformer, + "transformer": Transformer +} + + +def _startup_model(app: FastAPI) -> None: + if MODEL_TYPE not in MODEL_MAPPING: + raise RuntimeError(f"Unknown MODEL_MAPPING. 
Must be one of {MODEL_MAPPING.keys()}") + model_instance = MODEL_MAPPING[MODEL_TYPE]() + app.state.model = model_instance + + +def _shutdown_model(app: FastAPI) -> None: + app.state.model = None + + +def start_app_handler(app: FastAPI) -> Callable: + def startup() -> None: + logger.info("Running app start handler.") + _startup_model(app) + + return startup + + +def stop_app_handler(app: FastAPI) -> Callable: + def shutdown() -> None: + logger.info("Running app shutdown handler.") + _shutdown_model(app) + + return shutdown diff --git a/square-model-inference-api/inference_server/square_model_inference/core/messages.py b/square-model-inference-api/inference_server/square_model_inference/core/messages.py new file mode 100644 index 000000000..b1251e7fa --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/core/messages.py @@ -0,0 +1 @@ +HTTP_500_DETAIL = "Internal server error." diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/__init__.py b/square-model-inference-api/inference_server/square_model_inference/inference/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py new file mode 100644 index 000000000..8bbe71c96 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -0,0 +1,86 @@ +import json + +import torch +from loguru import logger + +from transformers import AutoModelWithHeads +from transformers.adapters.utils import download_cached, ADAPTER_HUB_INDEX_FILE + +from square_model_inference.inference.transformer import Transformer +from square_model_inference.models.request import PredictionRequest, Task + +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.core.config import MODEL_NAME, TRANSFORMERS_CACHE + + +class AdapterTransformer(Transformer): + def __init__(self): + self._load_model(AutoModelWithHeads) + self._load_adapter() + + def _load_adapter(self): + """ + Pre-load all available adapters for MODEL_NAME from adapterhub.ml. + We parse the hub index to extract all names and then load each model. + """ + logger.info("Loading all available adapters from adapterhub.ml") + index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(MODEL_NAME)) + adapter_index = json.load(open(index_file)) + adapters = set() + for task, datasets in adapter_index.items(): + for dataset in datasets.keys(): + for key in datasets[dataset].keys(): + if key != "default": + for org in datasets[dataset][key]["versions"].keys(): + adapters.add(f"{task}/{dataset}@{org}") + for adapter in adapters: + logger.debug(f"Loading adapter {adapter}") + self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_CACHE) + + # def _load_single_adapter(self, adapter_name: str): + # if adapter_name not in self.model.config.adapters.adapters: + # logger.info(f"Loading new adapter {adapter_name}") + # self.model.load_adapter(adapter_name, with_head=True, load_as=adapter_name) + # else: + # logger.debug(f"Adapter {adapter_name} is already loaded. 
Not loading again") + + def _token_classification(self, request: PredictionRequest) -> PredictionOutput: + # We only have to change the label2id mapping from config.label2id (what super() uses) to the mapping + # of the chosen head + prediction = super()._token_classification(request) + + label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] + id2label = {v:k for k,v in label2id.items()} + prediction.task_outputs["id2label"] = id2label + + return prediction + + def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: + # We only have to change the label2id mapping from config.label2id (what super() uses) to the mapping + # of the chosen head + prediction = super()._sequence_classification(request) + + label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] + id2label = {v:k for k,v in label2id.items()} + prediction.task_outputs["id2label"] = id2label + + return prediction + + async def predict(self, request: PredictionRequest) -> PredictionOutput: + if request.is_preprocessed: + ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + + if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: + ValueError(f"Unknown or missing adapter {request.adapter_name}. " + f"Please provider a fully specified adapter name from adapterhub.ml") + self.model.set_active_adapters(request.adapter_name) + + if request.task == Task.sequence_classification: + return self._sequence_classification(request) + elif request.task == Task.token_classification: + return self._token_classification(request) + elif request.task == Task.embedding: + return self._embedding(request) + elif request.task == Task.generation: + return self._generation(request) + diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/model.py b/square-model-inference-api/inference_server/square_model_inference/inference/model.py new file mode 100644 index 000000000..4aa92d8d2 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/model.py @@ -0,0 +1,18 @@ +from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.prediction import PredictionOutput + + +class Model: + """ + Base class for all models. + __init__ is supposed to load all weights and other necessary files (e.g. 
tokenizer) + so that the model can directly perform inference on request + """ + async def predict(self, payload: PredictionRequest) -> PredictionOutput: + """ + Take an input, pre-process it accordingly, perform inference according to the task, + post-process the result and return it + :param payload: the prediction request containing the input and any other parameters required + :return: the result of the prediction + """ + raise NotImplementedError diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py new file mode 100644 index 000000000..adb0b490a --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -0,0 +1,155 @@ +import json +from typing import Union, Tuple + +import torch +from loguru import logger + +from transformers import AutoTokenizer, AutoModel + +from square_model_inference.inference.model import Model +from square_model_inference.models.request import PredictionRequest, Task + +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.core.config import MODEL_NAME, DISABLE_GPU, MAX_BATCH_SIZE + + +class Transformer(Model): + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + + def __init__(self): + self._load_model(AutoModel) + + def _load_model(self, model_cls): + """ + Load the base model MODEL_NAME and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or environment variable DISABLE_GPU is true. + """ + logger.debug(f"Loading model {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + # Check if GPU is available + device = "cuda" if torch.cuda.is_available() and not DISABLE_GPU else "cpu" + model = model_cls.from_pretrained(MODEL_NAME).to(device) + logger.info(f"Model {MODEL_NAME} loaded on {device}") + + self.model = model + self.tokenizer = tokenizer + + def _ensure_tensor_on_device(self, **inputs): + """ + Ensure PyTorch tensors are on the specified device. + + Args: + inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`. + + Return: + :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device. + """ + return {name: tensor.to(self.model.device) for name, tensor in inputs.items()} + + def _predict(self, request: PredictionRequest, output_word_ids=False, output_attention_mask=False) \ + -> Union[dict, Tuple[dict, dict]]: + all_predictions = [] + features = self.tokenizer(request.input, + return_tensors="pt", + padding=True, + truncation=True, + **request.preprocessing_kwargs) + for start_idx in range(0, len(request.input), MAX_BATCH_SIZE): + with torch.no_grad(): + input_features = {k: features[k][start_idx:start_idx+MAX_BATCH_SIZE] for k in features.keys()} + input_features = self._ensure_tensor_on_device(**input_features) + predictions = self.model(**input_features, **request.model_kwargs) + all_predictions.append(predictions) + keys = all_predictions[0].keys() + final_prediction = {} + for key in keys: + # HuggingFace outputs for 'attentions' and more is returned as tuple of tensors + # Tuple of tuples only exists for 'past_key_values' which is only relevant for generation. 
+ # Generation should NOT use this function + if isinstance(all_predictions[0][key], tuple): + tuple_of_lists = list(zip(*[[p.cpu() for p in tpl[key]] for tpl in all_predictions])) + final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists) + else: + final_prediction[key] = torch.cat([p[key].cpu() for p in all_predictions]) + if any((output_word_ids, output_attention_mask)): + word_ids = [features.word_ids(i) for i in range(len(request.input))] + attention_mask = features["attention_mask"] + return final_prediction, {"word_ids": word_ids, "attention_mask": attention_mask} + return final_prediction + + def _embedding(self, request: PredictionRequest) -> PredictionOutput: + request.model_kwargs["output_hidden_states"] = True + predictions, other = self._predict(request, output_word_ids=True, output_attention_mask=True) + hidden_state = predictions.pop("hidden_states")[-1] + attention_mask = other["attention_mask"] + + embedding_mode = request.task_kwargs.get("embedding_mode", "mean") + if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: + ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") + + if embedding_mode == "cls": + emb = hidden_state[:, 0, :] + # copied from sentence-transformers pooling + elif embedding_mode == "max": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + hidden_state[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + emb = torch.max(hidden_state, 1)[0] + # copied from sentence-transformers pooling + elif embedding_mode == "mean": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + emb = sum_embeddings / sum_mask + elif embedding_mode == "token": + emb = hidden_state + predictions["embedding"] = emb + task_outputs = { + "embedding_mode": embedding_mode, + "word_ids": other["word_ids"] + } + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + def _token_classification(self, request: PredictionRequest) -> PredictionOutput: + predictions, other = self._predict(request, output_word_ids=True) + # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # We replace the logits by the softmax and add labels chosen with argmax + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) + label2id = self.model.config.label2id + id2label = {v:k for k,v in label2id.items()} + task_outputs = { + "id2label": id2label, + "word_ids": other["word_ids"] + } + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: + predictions = self._predict(request) + # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # We replace the logits by the softmax and add labels chosen with argmax + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) + label2id = self.model.config.label2id + id2label = 
{v:k for k,v in label2id.items()} + task_outputs = { + "id2label": id2label + } + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + async def predict(self, request: PredictionRequest) -> PredictionOutput: + if request.is_preprocessed: + ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + + if request.task == Task.sequence_classification: + return self._sequence_classification(request) + elif request.task == Task.token_classification: + return self._token_classification(request) + elif request.task == Task.embedding: + return self._embedding(request) + elif request.task == Task.generation: + return self._generation(request) + diff --git a/square-model-inference-api/inference_server/square_model_inference/models/__init__.py b/square-model-inference-api/inference_server/square_model_inference/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py new file mode 100644 index 000000000..34a47e56e --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class HearbeatResult(BaseModel): + is_alive: bool diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py new file mode 100644 index 000000000..03acbad00 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -0,0 +1,48 @@ +import torch +from io import BytesIO +import base64 +import numpy as np +from pydantic import Field, BaseModel +from square_model_inference.core.config import RETURN_PLAINTEXT_ARRAYS + +def _encode_numpy(obj): + def encode(arr): + if RETURN_PLAINTEXT_ARRAYS: + return arr.tolist() + else: + with BytesIO() as b: + np.save(b, arr) + arr_binary = b.getvalue() + arr_binary_b64 = base64.b64encode(arr_binary) + arr_string_b64 = arr_binary_b64.decode("latin1") + return arr_string_b64 + # LOADING WITH + # arr_binary_b64 = arr_string_b64.encode() + # arr_binary = base64.decodebytes(arr_binary_b64) + # arr = np.load(BytesIO(arr_binary)) + for k, v in obj.items(): + if isinstance(v, torch.Tensor): + v = v.numpy() + obj[k] = encode(v) + elif isinstance(v, tuple) and isinstance(v[0], torch.Tensor): + v = [encode(vv.numpy()) for vv in v] + obj[k] = v + return obj + + +class PredictionOutput(BaseModel): + """ + The results of the prediction of the model on the given input for the requested task. + """ + model_outputs: dict = Field( + description="Dictionary containing the model outputs as numpy arrays cast to lists." + ) + task_outputs: dict = Field( + description="Dictionary containing the task outputs. This covers anything from generated text, " + "id2label mappings, token2word mappings, etc." 
+ ) + + def __init__(self, **data): + super().__init__(**data) + self.model_outputs = _encode_numpy(self.model_outputs) + diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py new file mode 100644 index 000000000..3df379aa8 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -0,0 +1,54 @@ +from enum import Enum +from typing import Union, List, Optional + +from pydantic import BaseModel, Field + + +class Task(str, Enum): + """ + The available tasks that can be handled. Note that most models actually support only one task. + Requesting a task that a model cannot handle might fail or produce incorrect results. + """ + sequence_classification = "sequence_classification" + token_classification = "token_classification" + embedding = "embedding" + generation = "generation" + + +class PredictionRequest(BaseModel): + """ + Prediction request containing the input, pre-processing parameters, parameters for the model forward pass, + the task with task-specific parameters, and parameters for any post-processing. + """ + input: Union[List[str], List[List[str]], dict] = Field( + ..., + description="Input for the model. Supports Huggingface Transformer inputs (i.e., list of sentences, or " + "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " + "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True." + ) + is_preprocessed: bool = Field( + default=False, + description="Flag indicating that the input contains already pre-processed numpy arrays " + "as list and that it needs no further pre-processing." + ) + preprocessing_kwargs: dict = Field( + default={}, + description="Dictionary containing additional parameters for the pre-processing step." + ) + model_kwargs: dict = Field( + default={}, + description="Dictionary containing parameters that are passed to the model for the forward pass. " + "Set ‘output_attentions=True’ to receive the attention weights for Huggingface Transformers in the " + "output." + ) + task: Task = Field(...) + task_kwargs: dict = Field( + default={}, + description="Dictionary containing additional parameters for handling of the task and " + "task-related post-processing." + ) + adapter_name: Optional[str] = Field( + default="", + description="Only necessary for adapter-based models.
" + "The fully specified name of the to-be-used adapter from adapterhub.ml" + ) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tox.ini b/square-model-inference-api/inference_server/tox.ini new file mode 100644 index 000000000..e48ef368e --- /dev/null +++ b/square-model-inference-api/inference_server/tox.ini @@ -0,0 +1,66 @@ +[tox] +envlist = flake8,py38,bandit + +[testenv] +deps = + coverage + pytest + pytest-cov + pycodestyle>=2.0 +commands = + coverage erase + pip install -r {toxinidir}/requirements.txt + pytest --cov=tests --cov=square_model_inference --cov-report=term-missing --cov-config=setup.cfg + coverage report + coverage html -d htmlcov-{envname} + +[testenv:autopep8] +basepython = python3 +skip_install = true +deps = + autopep8>=1.5 +commands = + autopep8 --in-place --aggressive -r square_model_inference/ + + +[testenv:flake8] +basepython = python3 +skip_install = true +deps = + flake8 + flake8-bugbear + flake8-colors + flake8-typing-imports>=1.1 + pep8-naming +commands = + flake8 square_model_inference/ tests/ setup.py + +[testenv:bandit] +basepython = python3 +deps = + bandit +commands = + bandit -r square_model_inference/ + +[flake8] +exclude = + .tox, + .git, + __pycache__, + docs/source/conf.py, + build, + dist, + tests/fixtures/*, + *.pyc, + *.egg-info, + .cache, + .eggs +max-complexity = 10 +import-order-style = google +application-import-names = flake8 +max-line-length = 120 +ignore = + B008, + N803, + N806, + E126 \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf new file mode 100644 index 000000000..20c5c84d4 --- /dev/null +++ b/square-model-inference-api/nginx/nginx.conf @@ -0,0 +1,34 @@ +events { + worker_connections 1024; +} + +http { + server { + listen 8080; + + location / { + proxy_pass http://square_model_inference_bert:8000/; + } + + location /api/bert-base-uncased { + #auth_request /auth; + proxy_pass http://square_model_inference_bert:8000/api; + } + +# location /api/roberta-base { +# auth_request /auth; +# proxy_pass http://square_model_inference_roberta:8000/api; +# } + + location /auth { + internal; + set $query ''; + if ($request_uri ~* "[^\?]+\?(.*)$") { + set $query $1; + } + proxy_pass http://auth:8081/auth; + proxy_pass_request_body off; + proxy_set_header Content-Length ""; + } + } +} \ No newline at end of file diff --git a/square-model-inference-api/tests/__init__.py b/square-model-inference-api/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/tests/conftest.py b/square-model-inference-api/tests/conftest.py new file mode 100644 index 000000000..b21642754 --- /dev/null +++ b/square-model-inference-api/tests/conftest.py @@ -0,0 +1,16 @@ +import pytest +from starlette.config import environ +from starlette.testclient import TestClient + + +environ["API_KEY"] = "example_key" + + +from square_model_inference.main import get_app # noqa: E402 + + +@pytest.fixture() +def test_client(): + app = get_app() + with TestClient(app) as test_client: + yield test_client diff --git a/square-model-inference-api/tests/test_api/__init__.py b/square-model-inference-api/tests/test_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/tests/test_api/test_api_auth.py b/square-model-inference-api/tests/test_api/test_api_auth.py new file mode 100644 index 000000000..cc5467b20 --- /dev/null +++ b/square-model-inference-api/tests/test_api/test_api_auth.py 
@@ -0,0 +1,17 @@ +from square_model_inference.core import messages + + +def test_auth_using_prediction_api_no_apikey_header(test_client) -> None: + response = test_client.post("/api/v1/question") + assert response.status_code == 400 + assert response.json() == {"detail": messages.NO_API_KEY} + + +def test_auth_using_prediction_api_wrong_apikey_header(test_client) -> None: + response = test_client.post( + "/api/v1/question", + json={"context": "test", "question": "test"}, + headers={"token": "WRONG_TOKEN"}, + ) + assert response.status_code == 401 + assert response.json() == {"detail": messages.AUTH_REQ} diff --git a/square-model-inference-api/tests/test_api/test_heartbeat.py b/square-model-inference-api/tests/test_api/test_heartbeat.py new file mode 100644 index 000000000..343f594d7 --- /dev/null +++ b/square-model-inference-api/tests/test_api/test_heartbeat.py @@ -0,0 +1,14 @@ +from square_model_inference.main import get_app + +app = get_app() + + +def test_heartbeat(test_client) -> None: + response = test_client.get("/api/health/heartbeat") + assert response.status_code == 200 + assert response.json() == {"is_alive": True} + + +def test_default_route(test_client) -> None: + response = test_client.get("/") + assert response.status_code == 404 diff --git a/square-model-inference-api/tests/test_api/test_prediction.py b/square-model-inference-api/tests/test_api/test_prediction.py new file mode 100644 index 000000000..a00caf028 --- /dev/null +++ b/square-model-inference-api/tests/test_api/test_prediction.py @@ -0,0 +1,23 @@ +def test_prediction(test_client) -> None: + response = test_client.post( + "/api/v1/question", + json={"context": "two plus two equal four", "question": "What is four?"}, + headers={"token": "example_key"}, + ) + assert response.status_code == 200 + assert "score" in response.json() + + +def test_prediction_nopayload(test_client) -> None: + response = test_client.post( + "/api/v1/question", json={}, headers={"token": "example_key"} + ) + """ + ## if nopayload, default Hitchhiker's Guide to the Galaxy example is sent + # context:"42 is the answer to life, the universe and everything." + # question:"What is the answer to life?" 
+ # if no default, assert response.status_code == 422 + """ + + data = response.json() + assert data["answer"] == "42" diff --git a/square-model-inference-api/tests/test_service/__init__.py b/square-model-inference-api/tests/test_service/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/tests/test_service/test_models.py b/square-model-inference-api/tests/test_service/test_models.py new file mode 100644 index 000000000..61d666db7 --- /dev/null +++ b/square-model-inference-api/tests/test_service/test_models.py @@ -0,0 +1,27 @@ +import pytest + +from square_model_inference.core import config +from square_model_inference.models.payload import QAPredictionPayload +from square_model_inference.models.prediction import QAPredictionResult +from square_model_inference.transformers.nlp import QAModel + + +def test_prediction(test_client) -> None: + model_path = config.DEFAULT_MODEL_PATH + qa = QAPredictionPayload.parse_obj( + {"context": "two plus two equal four", "question": "What is four?"} + ) + + tm = QAModel(model_path) + result = tm.predict(qa) + assert isinstance(result, QAPredictionResult) + assert result.answer == "two plus two" + + +def test_prediction_no_payload(test_client) -> None: + model_path = config.DEFAULT_MODEL_PATH + + tm = QAModel(model_path) + with pytest.raises(ValueError): + result = tm.predict(None) + assert isinstance(result, QAPredictionResult) From dee676e5036389e1868be60a5ebeeaf0372247ed Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 10 Jun 2021 15:42:42 +0200 Subject: [PATCH 02/23] Changed model init to decouple from config, added question answering; added sentence-transformers --- .../{inference_server => }/.dockerignore | 0 square-model-inference-api/Makefile | 10 +- .../test_api => auth_server}/__init__.py | 0 .../__init__.py | 0 .../inference_server/main.py | 5 +- .../square_model_inference/core/config.py | 10 +- .../core/event_handlers.py | 15 +- .../inference/adaptertransformer.py | 16 +- .../inference/sentencetransformer.py | 45 +++++ .../inference/transformer.py | 186 ++++++++++++++---- .../models/prediction.py | 26 ++- .../square_model_inference/models/request.py | 1 + .../inference_server/tox.ini | 66 ------- 13 files changed, 247 insertions(+), 133 deletions(-) rename square-model-inference-api/{inference_server => }/.dockerignore (100%) rename square-model-inference-api/{tests/test_api => auth_server}/__init__.py (100%) rename square-model-inference-api/{tests/test_service => inference_server}/__init__.py (100%) create mode 100644 square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py delete mode 100644 square-model-inference-api/inference_server/tox.ini diff --git a/square-model-inference-api/inference_server/.dockerignore b/square-model-inference-api/.dockerignore similarity index 100% rename from square-model-inference-api/inference_server/.dockerignore rename to square-model-inference-api/.dockerignore diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index e49710103..9ca741306 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -7,14 +7,14 @@ SHELL := /bin/bash all: clean test install run deploy down test: - python -m pip install --upgrade pip && pip install setuptools tox - tox + python -m pip install --upgrade pip && pip install pytest + pytest install: pip install --upgrade pip && pip install -r inference_server/requirements.txt run: - PYTHONPATH=inference_server/ uvicorn 
inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env.debug + PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env build: docker-compose build @@ -31,12 +31,8 @@ clean: -find . -name '__pycache__' -exec rm -rf {} \; -find . -name 'Thumbs.db' -exec rm -rf {} \; -find . -name '*~' -exec rm -rf {} \; - -rm -rf .cache -rm -rf build -rm -rf dist -rm -rf *.egg-info - -rm -rf htmlcov* - -rm -rf .tox/ -rm -rf docs/_build - -rm -r .coverage -rm -rf .pytest_cache \ No newline at end of file diff --git a/square-model-inference-api/tests/test_api/__init__.py b/square-model-inference-api/auth_server/__init__.py similarity index 100% rename from square-model-inference-api/tests/test_api/__init__.py rename to square-model-inference-api/auth_server/__init__.py diff --git a/square-model-inference-api/tests/test_service/__init__.py b/square-model-inference-api/inference_server/__init__.py similarity index 100% rename from square-model-inference-api/tests/test_service/__init__.py rename to square-model-inference-api/inference_server/__init__.py diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index ba3ca507f..4d4f0825c 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -1,11 +1,11 @@ from fastapi import FastAPI from square_model_inference.api.routes.router import api_router -from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION, IS_DEBUG +from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler def get_app() -> FastAPI: - fast_app = FastAPI(title=APP_NAME, version=APP_VERSION, debug=IS_DEBUG) + fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) fast_app.include_router(api_router, prefix=API_PREFIX) fast_app.add_event_handler("startup", start_app_handler(fast_app)) @@ -13,7 +13,6 @@ def get_app() -> FastAPI: return fast_app - app = get_app() if __name__ == "__main__": diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index d56dd59e2..a3973bdee 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -6,13 +6,19 @@ config = Config(".env") -IS_DEBUG: bool = config("IS_DEBUG", cast=bool, default=False) - +# Disable CUDA even if available DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) +# Corresponds to the Huggingface name for Transformers MODEL_NAME: str = config("MODEL_NAME") +# Type of the model, e.g. Transformers, Adapter, ... 
MODEL_TYPE: str = config("MODEL_TYPE") +# Cache directory where model weights are stored TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") MAX_BATCH_SIZE: int = config("MAX_BATCH_SIZE", cast=int, default=32) +# For MODEL_TYPE=transformers: decides the AutoModelFor* class used +MODEL_CLASS: str = config("MODEL_CLASS", default="base") + +# Flag that decides if returned numpy arrays are returned as lists or encoded to base64 (smaller but not easily human readable) RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 74493aa50..bec998460 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -4,19 +4,28 @@ from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.core.config import MODEL_TYPE +from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, MAX_BATCH_SIZE, TRANSFORMERS_CACHE +from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer MODEL_MAPPING = { "adapter": AdapterTransformer, - "transformer": Transformer + "transformer": Transformer, + "sentence-transformer": SentenceTransformer } +MODEL_KWARGS = { + "model_name": MODEL_NAME, + "model_class": MODEL_CLASS, + "disable_gpu": DISABLE_GPU, + "max_batch_size": MAX_BATCH_SIZE, + "transformers_cache": TRANSFORMERS_CACHE +} def _startup_model(app: FastAPI) -> None: if MODEL_TYPE not in MODEL_MAPPING: raise RuntimeError(f"Unknown MODEL_MAPPING. Must be one of {MODEL_MAPPING.keys()}") - model_instance = MODEL_MAPPING[MODEL_TYPE]() + model_instance = MODEL_MAPPING[MODEL_TYPE](**MODEL_KWARGS) app.state.model = model_instance diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 8bbe71c96..ed7e0b4c1 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -10,21 +10,21 @@ from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.core.config import MODEL_NAME, TRANSFORMERS_CACHE class AdapterTransformer(Transformer): - def __init__(self): - self._load_model(AutoModelWithHeads) - self._load_adapter() + def __init__(self, model_name, max_batch_size, disable_gpu, transformers_cache, **kwargs): + self._load_model(AutoModelWithHeads, model_name, disable_gpu) + self._load_adapter(model_name, transformers_cache) + self.max_batch_size = max_batch_size - def _load_adapter(self): + def _load_adapter(self, model_name, transformers_cache): """ Pre-load all available adapters for MODEL_NAME from adapterhub.ml. We parse the hub index to extract all names and then load each model. 
""" logger.info("Loading all available adapters from adapterhub.ml") - index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(MODEL_NAME)) + index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) adapter_index = json.load(open(index_file)) adapters = set() for task, datasets in adapter_index.items(): @@ -35,7 +35,7 @@ def _load_adapter(self): adapters.add(f"{task}/{dataset}@{org}") for adapter in adapters: logger.debug(f"Loading adapter {adapter}") - self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_CACHE) + self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) # def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: @@ -79,6 +79,8 @@ async def predict(self, request: PredictionRequest) -> PredictionOutput: return self._sequence_classification(request) elif request.task == Task.token_classification: return self._token_classification(request) + elif request.task == Task.question_answering: + return self._question_answering(request) elif request.task == Task.embedding: return self._embedding(request) elif request.task == Task.generation: diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py new file mode 100644 index 000000000..fdfd32ad6 --- /dev/null +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -0,0 +1,45 @@ +import json +from typing import Union, Tuple + +import torch +from loguru import logger +import numpy as np +from sentence_transformers import SentenceTransformer + +from square_model_inference.inference.model import Model +from square_model_inference.models.request import PredictionRequest, Task + +from square_model_inference.models.prediction import PredictionOutput + + +class SentenceTransformer(Model): + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + + def __init__(self, model_name, max_batch_size, disable_gpu, **kwargs): + self._load_model(model_name, disable_gpu) + self.max_batch_size = max_batch_size + + def _load_model(self, model_name, disable_gpu): + """ + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. + """ + logger.debug(f"Loading model {model_name}") + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = SentenceTransformer(model_name_or_path=model_name, device=device) + logger.info(f"Model {model_name} loaded on {device}") + self.model = model + + def _embedding(self, request: PredictionRequest) -> PredictionOutput: + embeddings = self.model.encode(request.input, batch_size=self.max_batch_size, show_progress_bar=False) + return PredictionOutput(model_outputs={"embeddings": embeddings}, task_outputs={}) + + + async def predict(self, request: PredictionRequest) -> PredictionOutput: + if request.is_preprocessed: + ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") + + if request.task != Task.embedding: + NotImplementedError("Only embedding task supported by this model") + return self._embedding(request) + diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index adb0b490a..c4c0a4f92 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -3,33 +3,42 @@ import torch from loguru import logger - -from transformers import AutoTokenizer, AutoModel +import numpy as np +from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \ + AutoModelForTokenClassification, AutoModelForQuestionAnswering from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.core.config import MODEL_NAME, DISABLE_GPU, MAX_BATCH_SIZE +CLASS_MAPPING = { + "base": AutoModel, + "sequence_classification": AutoModelForSequenceClassification, + "token_classifcation": AutoModelForTokenClassification, + "question_answering": AutoModelForQuestionAnswering +} class Transformer(Model): SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] - def __init__(self): - self._load_model(AutoModel) + def __init__(self, model_name, model_class, max_batch_size, disable_gpu, **kwargs): + if model_class not in CLASS_MAPPING: + raise RuntimeError(f"Unknown MODEL_CLASS. Must be one of {CLASS_MAPPING.keys()}") + self._load_model(CLASS_MAPPING[model_class], model_name, disable_gpu) + self.max_batch_size = max_batch_size - def _load_model(self, model_cls): + def _load_model(self, model_cls, model_name, disable_gpu): """ - Load the base model MODEL_NAME and its tokenizer with Huggingface. - Model will be moved to GPU unless CUDA is unavailable or environment variable DISABLE_GPU is true. + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. 
""" - logger.debug(f"Loading model {MODEL_NAME}") - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + logger.debug(f"Loading model {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) # Check if GPU is available - device = "cuda" if torch.cuda.is_available() and not DISABLE_GPU else "cpu" - model = model_cls.from_pretrained(MODEL_NAME).to(device) - logger.info(f"Model {MODEL_NAME} loaded on {device}") + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = model_cls.from_pretrained(model_name).to(device) + logger.info(f"Model {model_name} loaded on {device}") self.model = model self.tokenizer = tokenizer @@ -46,17 +55,17 @@ def _ensure_tensor_on_device(self, **inputs): """ return {name: tensor.to(self.model.device) for name, tensor in inputs.items()} - def _predict(self, request: PredictionRequest, output_word_ids=False, output_attention_mask=False) \ + def _predict(self, request: PredictionRequest, output_features=False) \ -> Union[dict, Tuple[dict, dict]]: all_predictions = [] + request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", True) + request.preprocessing_kwargs["truncation"] = request.preprocessing_kwargs.get("truncation", True) features = self.tokenizer(request.input, return_tensors="pt", - padding=True, - truncation=True, **request.preprocessing_kwargs) - for start_idx in range(0, len(request.input), MAX_BATCH_SIZE): + for start_idx in range(0, len(request.input), self.max_batch_size): with torch.no_grad(): - input_features = {k: features[k][start_idx:start_idx+MAX_BATCH_SIZE] for k in features.keys()} + input_features = {k: features[k][start_idx:start_idx+self.max_batch_size] for k in features.keys()} input_features = self._ensure_tensor_on_device(**input_features) predictions = self.model(**input_features, **request.model_kwargs) all_predictions.append(predictions) @@ -71,19 +80,22 @@ def _predict(self, request: PredictionRequest, output_word_ids=False, output_att final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists) else: final_prediction[key] = torch.cat([p[key].cpu() for p in all_predictions]) - if any((output_word_ids, output_attention_mask)): - word_ids = [features.word_ids(i) for i in range(len(request.input))] - attention_mask = features["attention_mask"] - return final_prediction, {"word_ids": word_ids, "attention_mask": attention_mask} + if output_features: + return final_prediction, features return final_prediction def _embedding(self, request: PredictionRequest) -> PredictionOutput: request.model_kwargs["output_hidden_states"] = True - predictions, other = self._predict(request, output_word_ids=True, output_attention_mask=True) + predictions, features = self._predict(request, output_features=True) + # We remove hidden_states from predictions! 
hidden_state = predictions.pop("hidden_states")[-1] - attention_mask = other["attention_mask"] + attention_mask = features["attention_mask"] embedding_mode = request.task_kwargs.get("embedding_mode", "mean") + task_outputs = { + "embedding_mode": embedding_mode + } + if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") @@ -102,42 +114,132 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: emb = sum_embeddings / sum_mask elif embedding_mode == "token": emb = hidden_state - predictions["embedding"] = emb - task_outputs = { - "embedding_mode": embedding_mode, - "word_ids": other["word_ids"] - } + task_outputs["word_ids"] = [features.word_ids(i) for i in range(len(request.input))] + predictions["embeddings"] = emb + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _token_classification(self, request: PredictionRequest) -> PredictionOutput: - predictions, other = self._predict(request, output_word_ids=True) + predictions, features = self._predict(request, output_features=True) # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): - probabilities = torch.softmax(predictions["logits"], dim=-1) - predictions["logits"] = probabilities - predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) label2id = self.model.config.label2id id2label = {v:k for k,v in label2id.items()} task_outputs = { "id2label": id2label, - "word_ids": other["word_ids"] + "word_ids": [features.word_ids(i) for i in range(len(request.input))] } + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: predictions = self._predict(request) - # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: - # We replace the logits by the softmax and add labels chosen with argmax - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): - probabilities = torch.softmax(predictions["logits"], dim=-1) - predictions["logits"] = probabilities - predictions["labels"] = torch.argmax(predictions["logits"], dim=-1) label2id = self.model.config.label2id id2label = {v:k for k,v in label2id.items()} task_outputs = { "id2label": id2label } + # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # We replace the logits by the softmax and add labels chosen with argmax + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + probabilities = torch.softmax(predictions["logits"], dim=-1) + predictions["logits"] = probabilities + task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() + + return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + + def _generation(self, request: PredictionRequest) -> PredictionOutput: + NotImplementedError("Generation is currently not implemented") + + def _question_answering(self, 
request: PredictionRequest) -> PredictionOutput: + # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline + def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, undesired_tokens_: np.ndarray) -> Tuple: + """ + Take the output of any :obj:`ModelForQuestionAnswering` and will generate probabilities for each span to be the + actual answer. + + In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or + answer end position being before the starting position. The method supports output the k-best answer through + the topk argument. + + Args: + start_ (:obj:`np.ndarray`): Individual start probabilities for each token. + end (:obj:`np.ndarray`): Individual end_ probabilities for each token. + topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output. + max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output. + undesired_tokens_ (:obj:`np.ndarray`): Mask determining tokens that can be part of the answer + """ + # Ensure we have batch axis + if start_.ndim == 1: + start_ = start_[None] + + if end_.ndim == 1: + end_ = end_[None] + + # Compute the score of each tuple(start_, end_) to be the real answer + outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1)) + + # Remove candidate with end_ < start_ and end_ - start_ > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + starts_, ends_ = np.unravel_index(idx_sort, candidates.shape)[1:] + desired_spans = np.isin(starts_, undesired_tokens_.nonzero()) & np.isin(ends_, undesired_tokens_.nonzero()) + starts_ = starts_[desired_spans] + ends_ = ends_[desired_spans] + scores_ = candidates[0, starts_, ends_] + + return starts_, ends_, scores_ + + request.preprocessing_kwargs["truncation"] = "only_second" + predictions, features = self._predict(request, output_features=True) + + task_outputs = {"answers": []} + for idx, (start, end, (_, context)) in enumerate(zip(predictions["start_logits"], predictions["end_logits"], request.input)): + start = start.numpy() + end = end.numpy() + # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. 
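The decode helper above scores every (start, end) pair in one shot via an outer product of the start and end probabilities, then zeroes out spans that end before they start or exceed max_answer_len. A toy, self-contained illustration of that trick (all probabilities are made up):

import numpy as np

start = np.array([[0.1, 0.7, 0.1, 0.1]])  # P(token i is the answer start)
end = np.array([[0.1, 0.1, 0.6, 0.2]])    # P(token j is the answer end)
max_answer_len = 2

# score[i, j] = start[i] * end[j] for every candidate span (i, j)
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
# keep only spans with j >= i (triu) and j - i < max_answer_len (tril)
candidates = np.tril(np.triu(outer), max_answer_len - 1)
best = np.unravel_index(np.argmax(candidates), candidates.shape)
# best[1:] == (1, 2): tokens 1..2 form the best span, score approx. 0.7 * 0.6
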
+ undesired_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1) & features["attention_mask"][idx].numpy() + + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes in the tensor cannot contribute to the softmax + start = np.where(undesired_tokens_mask, -10000.0, start) + end = np.where(undesired_tokens_mask, -10000.0, end) + + start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True))) + end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True))) + + starts, ends, scores = decode( + start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 512), undesired_tokens + ) + enc = features[idx] + answers = [ + { + "score": score.item(), + "start": enc.word_to_chars( + enc.token_to_word(s), sequence_index=1)[0], + "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1], + "answer": context[ + enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0] : + enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]], + } + for s, e, score in zip(starts, ends, scores)] + task_outputs["answers"].append(answers) return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) async def predict(self, request: PredictionRequest) -> PredictionOutput: @@ -150,6 +252,8 @@ async def predict(self, request: PredictionRequest) -> PredictionOutput: return self._token_classification(request) elif request.task == Task.embedding: return self._embedding(request) + elif request.task == Task.question_answering: + return self._question_answering(request) elif request.task == Task.generation: return self._generation(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 03acbad00..a62c4151d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,3 +1,5 @@ +from typing import Dict, Union, Tuple + import torch from io import BytesIO import base64 @@ -5,18 +7,26 @@ from pydantic import Field, BaseModel from square_model_inference.core.config import RETURN_PLAINTEXT_ARRAYS -def _encode_numpy(obj): + +def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]]) -> Dict[str, Union[list, str]]: + """ + Encodes the Torch Tensors first to Numpy arrays and then makes either plain lists or base64-encode them depending + on the flag RETURN_PLAINTEXT_ARRAYS + :param obj: the objects whose tensors will be encoded + :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. + """ def encode(arr): if RETURN_PLAINTEXT_ARRAYS: return arr.tolist() else: + # np.save expects a file which we emulate with BytesIO with BytesIO() as b: np.save(b, arr) arr_binary = b.getvalue() arr_binary_b64 = base64.b64encode(arr_binary) arr_string_b64 = arr_binary_b64.decode("latin1") return arr_string_b64 - # LOADING WITH + # DECODE THE VALUE WITH # arr_binary_b64 = arr_string_b64.encode() # arr_binary = base64.decodebytes(arr_binary_b64) # arr = np.load(BytesIO(arr_binary)) @@ -35,14 +45,22 @@ class PredictionOutput(BaseModel): The results of the prediction of the model on the given input for the requested task. """ model_outputs: dict = Field( - description="Dictionary containing the model outputs as numpy arrays cast to lists." 
+ description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array." ) task_outputs: dict = Field( description="Dictionary containing the task outputs. This covers anything from generated text, " - "id2label mappings, token2word mappings, etc." + "labels, id2label mappings, token2word mappings, etc." ) def __init__(self, **data): + """ + Data model for the model and task outputs. + The model outputs (,i.e., tensors) will be encoded as base64 strings or as plain lists depending on the flag + RETURN_PLAINTEXT_ARRAYS. + :param data: + 'model_outputs': dict[str: Union[torch.Tensor, Tuple[torch.Tensor]]]. All tensor results of the model + 'task_outputs': dict[str: Any]. All non-tensor results of the processed task like the predicted labels, extracted spans, etc. + """ super().__init__(**data) self.model_outputs = _encode_numpy(self.model_outputs) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index 3df379aa8..a41190689 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -11,6 +11,7 @@ class Task(str, Enum): """ sequence_classification = "sequence_classification" token_classification = "token_classification" + question_answering = "question_answering" embedding = "embedding" generation = "generation" diff --git a/square-model-inference-api/inference_server/tox.ini b/square-model-inference-api/inference_server/tox.ini deleted file mode 100644 index e48ef368e..000000000 --- a/square-model-inference-api/inference_server/tox.ini +++ /dev/null @@ -1,66 +0,0 @@ -[tox] -envlist = flake8,py38,bandit - -[testenv] -deps = - coverage - pytest - pytest-cov - pycodestyle>=2.0 -commands = - coverage erase - pip install -r {toxinidir}/requirements.txt - pytest --cov=tests --cov=square_model_inference --cov-report=term-missing --cov-config=setup.cfg - coverage report - coverage html -d htmlcov-{envname} - -[testenv:autopep8] -basepython = python3 -skip_install = true -deps = - autopep8>=1.5 -commands = - autopep8 --in-place --aggressive -r square_model_inference/ - - -[testenv:flake8] -basepython = python3 -skip_install = true -deps = - flake8 - flake8-bugbear - flake8-colors - flake8-typing-imports>=1.1 - pep8-naming -commands = - flake8 square_model_inference/ tests/ setup.py - -[testenv:bandit] -basepython = python3 -deps = - bandit -commands = - bandit -r square_model_inference/ - -[flake8] -exclude = - .tox, - .git, - __pycache__, - docs/source/conf.py, - build, - dist, - tests/fixtures/*, - *.pyc, - *.egg-info, - .cache, - .eggs -max-complexity = 10 -import-order-style = google -application-import-names = flake8 -max-line-length = 120 -ignore = - B008, - N803, - N806, - E126 \ No newline at end of file From d9046a0835dc148528b0260a57e6e8c74cb33a5e Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 11 Jun 2021 16:33:39 +0200 Subject: [PATCH 03/23] Startet adding tests --- square-model-inference-api/Makefile | 4 +- .../inference_server/Dockerfile | 13 +- .../api/routes/router.py | 2 +- .../inference/adaptertransformer.py | 4 +- .../inference/sentencetransformer.py | 4 +- .../inference/transformer.py | 13 +- .../models/prediction.py | 4 +- .../square_model_inference/models/request.py | 2 +- .../inference_server/tests/conftest.py | 85 ++++++ .../pre_test_setup_for_docker_caching.py | 24 ++ 
.../tests/test_api/test_heartbeat.py | 10 +- .../tests/test_api/test_prediction.py | 39 +++ .../test_prediction_output_numpy.py | 36 +++ .../tests/test_inference/test_transformers.py | 273 ++++++++++++++++++ square-model-inference-api/tests/__init__.py | 0 square-model-inference-api/tests/conftest.py | 16 - .../tests/test_api/test_api_auth.py | 17 -- .../tests/test_api/test_prediction.py | 23 -- .../tests/test_service/test_models.py | 27 -- 19 files changed, 491 insertions(+), 105 deletions(-) create mode 100644 square-model-inference-api/inference_server/tests/conftest.py create mode 100644 square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py rename square-model-inference-api/{ => inference_server}/tests/test_api/test_heartbeat.py (52%) create mode 100644 square-model-inference-api/inference_server/tests/test_api/test_prediction.py create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_transformers.py delete mode 100644 square-model-inference-api/tests/__init__.py delete mode 100644 square-model-inference-api/tests/conftest.py delete mode 100644 square-model-inference-api/tests/test_api/test_api_auth.py delete mode 100644 square-model-inference-api/tests/test_api/test_prediction.py delete mode 100644 square-model-inference-api/tests/test_service/test_models.py diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 9ca741306..7acebffb0 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -7,8 +7,8 @@ SHELL := /bin/bash all: clean test install run deploy down test: - python -m pip install --upgrade pip && pip install pytest - pytest + python -m pip install --upgrade pip && pip install pytest pytest-cov pytest-asyncio + pytest --cov install: pip install --upgrade pip && pip install -r inference_server/requirements.txt diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index 4418f182a..dee4290fb 100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8.2 +FROM python:3.8.2 as build ENV PYTHONUNBUFFERED 1 @@ -9,6 +9,17 @@ COPY requirements.txt ./ RUN pip install --upgrade pip && \ pip install -r requirements.txt +COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_docker_caching.py +RUN python ./tests/pre_test_setup_for_docker_caching.py + COPY . 
./ +FROM base as test +WORKDIR /app +RUN pip install pytest pytest-cov pytest-asyncio +COPY ../tests tests +RUN pytest --cov + +FROM base as build + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py index b1fac0bfd..67d60b65e 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/router.py @@ -4,4 +4,4 @@ api_router = APIRouter() api_router.include_router(heartbeat.router, tags=["health"], prefix="/health") -api_router.include_router(prediction.router, tags=["prediction"], prefix="/v1") +api_router.include_router(prediction.router, tags=["prediction"]) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index ed7e0b4c1..e76259aa3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -68,10 +68,10 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: - ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: - ValueError(f"Unknown or missing adapter {request.adapter_name}. " + raise ValueError(f"Unknown or missing adapter {request.adapter_name}. " f"Please provider a fully specified adapter name from adapterhub.ml") self.model.set_active_adapters(request.adapter_name) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index fdfd32ad6..f6acf7672 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -37,9 +37,9 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: - ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") if request.task != Task.embedding: - NotImplementedError("Only embedding task supported by this model") + raise NotImplementedError("Only embedding task supported by this model") return self._embedding(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index c4c0a4f92..32c34cc59 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -5,7 +5,7 @@ from loguru import logger import numpy as np from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \ - AutoModelForTokenClassification, AutoModelForQuestionAnswering + AutoModelForTokenClassification, AutoModelForQuestionAnswering, AutoModelForCausalLM from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task @@ -15,8 +15,9 @@ CLASS_MAPPING = { "base": AutoModel, "sequence_classification": AutoModelForSequenceClassification, - "token_classifcation": AutoModelForTokenClassification, - "question_answering": AutoModelForQuestionAnswering + "token_classification": AutoModelForTokenClassification, + "question_answering": AutoModelForQuestionAnswering, + "generation": AutoModelForCausalLM } class Transformer(Model): @@ -97,7 +98,7 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: } if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: - ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") + raise ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") if embedding_mode == "cls": emb = hidden_state[:, 0, :] @@ -153,7 +154,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _generation(self, request: PredictionRequest) -> PredictionOutput: - NotImplementedError("Generation is currently not implemented") + raise NotImplementedError("Generation is currently not implemented") def _question_answering(self, request: PredictionRequest) -> PredictionOutput: # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline @@ -244,7 +245,7 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: - ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") if request.task == Task.sequence_classification: return self._sequence_classification(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index a62c4151d..333b150e7 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -8,7 +8,7 @@ from square_model_inference.core.config import RETURN_PLAINTEXT_ARRAYS -def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]]) -> Dict[str, Union[list, str]]: +def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]], return_plaintext: bool=RETURN_PLAINTEXT_ARRAYS) -> Dict[str, Union[list, str]]: """ Encodes the Torch Tensors first to Numpy arrays and then makes either plain lists or base64-encode them depending on the flag RETURN_PLAINTEXT_ARRAYS @@ -16,7 +16,7 @@ def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]]) -> D :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. """ def encode(arr): - if RETURN_PLAINTEXT_ARRAYS: + if return_plaintext: return arr.tolist() else: # np.save expects a file which we emulate with BytesIO diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index a41190689..ab3c7e1ea 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -39,7 +39,7 @@ class PredictionRequest(BaseModel): model_kwargs: dict = Field( default={}, description="Dictionary containing parameters that are passed to the model for the forward pass. " - "Set ‘output_attention=True’ to receive the attention weights for Huggingface Transformers in the " + "Set ‘output_attentions=True’ to receive the attention weights for Huggingface Transformers in the " "output." ) task: Task = Field(...) diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py new file mode 100644 index 000000000..8a2f333f8 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -0,0 +1,85 @@ +import pytest + +from fastapi.testclient import TestClient +from pre_test_setup_for_docker_caching import TRANSFORMERS_TESTING_CACHE, TRANSFORMER_MODEL, SENTENCE_MODEL +import torch + +## Due to import and config reasons, the environ is set in pre_test_setup_for_docker_caching ! +## (because we import Transformer, which imports Model, imports PredictionOutput, which imports RETURN_PLAINTEXT_ARRAYS and this creates the starlette config. 
+## The config is read by this point and starlette forbids overwriting it then) +# from starlette.config import environ +# environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE +# environ["MODEL_NAME"] = "test" +# environ["MODEL_TYPE"] = "test" +# environ["DISABLE_GPU"] = "True" +# environ["MAX_BATCH_SIZE"] = "1" +# environ["RETURN_PLAINTEXT_ARRAYS"] = "False" + +from main import get_app +from square_model_inference.inference.model import Model +from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.models.request import PredictionRequest +from square_model_inference.inference.transformer import Transformer +from square_model_inference.inference.adaptertransformer import AdapterTransformer +from square_model_inference.inference.sentencetransformer import SentenceTransformer + +@pytest.fixture(scope="session") +def test_app(): + app = get_app() + app.state.model = TestModel() + return app + + +@pytest.fixture() +def test_client(test_app): + with TestClient(test_app) as test_client: + yield test_client + + +class TestModel(Model): + def __init__(self): + self.prediction = PredictionOutput(model_outputs={}, task_outputs={}) + + async def predict(self, payload: PredictionRequest) -> PredictionOutput: + return self.prediction + + +# We only load bert-base-uncased, so we fix the random seed to always get the same randomly generated heads on top +@pytest.fixture(scope="class") +def test_transformer_sequence_classification(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "sequence_classification", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_embedding(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "base", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_token_classification(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "token_classification", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_question_answering(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "question_answering", 1, True) + + +@pytest.fixture(scope="class") +def test_transformer_generation(): + torch.manual_seed(987654321) + return Transformer(TRANSFORMER_MODEL, "generation", 1, True) + + +@pytest.fixture(scope="class") +def test_adapter(): + return AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) + + +@pytest.fixture(scope="class") +def test_sentence_transformer(): + return SentenceTransformer(SENTENCE_MODEL, 1, True) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py new file mode 100644 index 000000000..fe6794e6b --- /dev/null +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -0,0 +1,24 @@ +# This file is used for the Docker image to cache long-running setup for the tests, +# i.e., downloading finetuned Transformer models and so on. 
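The Transformer fixtures above call torch.manual_seed before building each model because bert-base-uncased ships no task heads: the AutoModelFor* classes attach randomly initialised ones, and fixing the seed is what makes the hard-coded expected values in the regression-style tests below reproducible. A small sketch of the idea (the seed value and attribute access are illustrative):

import torch
from transformers import AutoModelForSequenceClassification

torch.manual_seed(987654321)
m1 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
torch.manual_seed(987654321)
m2 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# Same seed, same randomly initialised classification head.
assert torch.equal(m1.classifier.weight, m2.classifier.weight)
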
+# This way, adding new tests or even changing the server code does NOT trigger a new download during building +from starlette.config import environ +TRANSFORMERS_TESTING_CACHE = "./model_testing_cache" +environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE +environ["MODEL_NAME"] = "test" +environ["MODEL_TYPE"] = "test" +environ["DISABLE_GPU"] = "True" +environ["MAX_BATCH_SIZE"] = "1" +environ["RETURN_PLAINTEXT_ARRAYS"] = "True" + +from square_model_inference.inference.transformer import Transformer +from square_model_inference.inference.adaptertransformer import AdapterTransformer +from square_model_inference.inference.sentencetransformer import SentenceTransformer + +#Downloaded models: +TRANSFORMER_MODEL = "bert-base-uncased" +SENTENCE_MODEL = "paraphrase-MiniLM-L6-v2 " + +if __name__ == "__main__": + bert_transformer = Transformer(TRANSFORMER_MODEL, "base", 1, True) + adapter_transformer = AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) + sentence_transformer = SentenceTransformer(SENTENCE_MODEL) \ No newline at end of file diff --git a/square-model-inference-api/tests/test_api/test_heartbeat.py b/square-model-inference-api/inference_server/tests/test_api/test_heartbeat.py similarity index 52% rename from square-model-inference-api/tests/test_api/test_heartbeat.py rename to square-model-inference-api/inference_server/tests/test_api/test_heartbeat.py index 343f594d7..eade6ef16 100644 --- a/square-model-inference-api/tests/test_api/test_heartbeat.py +++ b/square-model-inference-api/inference_server/tests/test_api/test_heartbeat.py @@ -1,14 +1,14 @@ -from square_model_inference.main import get_app +from starlette.testclient import TestClient -app = get_app() - -def test_heartbeat(test_client) -> None: +def test_heartbeat(test_app) -> None: + test_client = TestClient(test_app) response = test_client.get("/api/health/heartbeat") assert response.status_code == 200 assert response.json() == {"is_alive": True} -def test_default_route(test_client) -> None: +def test_default_route(test_app) -> None: + test_client = TestClient(test_app) response = test_client.get("/") assert response.status_code == 404 diff --git a/square-model-inference-api/inference_server/tests/test_api/test_prediction.py b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py new file mode 100644 index 000000000..9829ff680 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py @@ -0,0 +1,39 @@ +from starlette.testclient import TestClient + + +def test_prediction(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/predict", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 + + +def test_prediction_wrong_request(test_app) -> None: + test_client = TestClient(test_app, raise_server_exceptions=False) + response = test_client.post( + "/api/predict", json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + # required + #"model_kwargs": {}, + #"task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 422 diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py b/square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py new file 
mode 100644 index 000000000..35c4309c6 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_prediction_output_numpy.py @@ -0,0 +1,36 @@ +from starlette.config import Environ +from square_model_inference.models.prediction import PredictionOutput, _encode_numpy +from io import BytesIO +import base64 +import torch +import numpy as np + +def test_prediction_output_numpy_encoded() -> None: + + arr = np.ones(shape=(10,10), dtype="float32") + + output = _encode_numpy({"test": torch.from_numpy(arr)}, return_plaintext=False) + + encoded_arr = output["test"] + + # reversing code + arr_binary_b64 = encoded_arr.encode() + arr_binary = base64.decodebytes(arr_binary_b64) + arr_back = np.load(BytesIO(arr_binary)) + + np.testing.assert_equal(arr, arr_back) + + +def test_prediction_output_numpy_plaintext() -> None: + + arr = np.ones(shape=(10,10), dtype="float32") + + output = _encode_numpy({"test": torch.from_numpy(arr)}, return_plaintext=True) + + plaintext_list_arr = output["test"] + + # reversing code + arr_back = np.array(plaintext_list_arr) + + np.testing.assert_equal(arr, arr_back) + diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py new file mode 100644 index 000000000..a8631c0ef --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -0,0 +1,273 @@ +import pytest + +from square_model_inference.models.request import PredictionRequest +import numpy as np + +@pytest.mark.usefixtures("test_transformer_sequence_classification") +class TestTransformerSequenceClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output", [(["this is a test"], [1]), + (["this is a test", "this is a test with a longer sentence"], [1, 1])], + ids=["single", "batch"]) + async def test_sequence_classification(self, test_transformer_sequence_classification, input, output): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_sequence_classification.predict(request) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are not converted to softmax") + assert prediction.task_outputs["labels"] == output + assert "logits" in prediction.model_outputs + assert "id2label" in prediction.task_outputs + + @pytest.mark.asyncio + async def test_sequence_classification_output_attention(self, test_transformer_sequence_classification): + request = PredictionRequest.parse_obj({ + "input": ["testing test"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {"output_attentions": True}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_sequence_classification.predict(request) + assert "attentions" in prediction.model_outputs + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output", [(["this is a test"], [[0.17748619616031647, 0.4802907109260559]]), + (["this is a test", "this is a test with a longer sentence"], + [[0.17748622596263885, 0.48029083013534546], [-0.048022523522377014, 0.5764690637588501]])], + ids=["single", "batch"]) + async def test_sequence_classification_regression(self, 
test_transformer_sequence_classification, input, output): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {"regression": True}, + "adapter_name": "" + }) + + prediction = await test_transformer_sequence_classification.predict(request) + assert prediction.model_outputs["logits"] == output + assert "labels" not in prediction.task_outputs + assert "logits" in prediction.model_outputs + + +@pytest.mark.usefixtures("test_transformer_token_classification") +class TestTransformerTokenClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[0, 1, 1, 1, 1, 1]], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification(self, test_transformer_token_classification, input, output, word_ids): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "token_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_token_classification.predict(request) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(output), err_msg="logits are not converted to softmax") + assert prediction.task_outputs["labels"] == output + assert "logits" in prediction.model_outputs + assert "id2label" in prediction.task_outputs + assert prediction.task_outputs["word_ids"] == word_ids + + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[[0.11221601068973541, 0.05969493091106415], + [-0.15642595291137695, -0.07739989459514618], + [-0.2801300287246704, 0.06872765719890594], + [-0.28268659114837646, -0.01801353693008423], + [0.032697953283786774, 0.051608189940452576], + [0.01579795777797699, 0.23798659443855286]]], + [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[[0.11221593618392944, 0.059694863855838776], + [-0.15642589330673218, -0.07740016281604767], + [-0.2801305651664734, 0.06872743368148804], + [-0.28268682956695557, -0.018013179302215576], + [0.03269825875759125, 0.051608070731163025], + [0.01579788327217102, 0.2379867136478424], + [0.03173816204071045, -0.15285855531692505], + [0.11003853380680084, -0.12873490154743195], + [0.13318589329719543, -0.10646772384643555], + [0.13220593333244324, -0.12443935126066208]], + [[-0.05789753794670105, 0.04508095979690552], + [-0.27872341871261597, -0.1229611188173294], + [-0.5753417015075684, 0.07893967628479004], + [-0.47106701135635376, -0.04298338294029236], + [-0.5324697494506836, 0.10730768740177155], + [-0.31086403131484985, 0.3265591263771057], + [-0.2457294911146164, 0.24867495894432068], + [-0.2389427125453949, 0.4464331567287445], + [-0.08393016457557678, -0.03680300712585449], + [-0.01722254604101181, 0.41447973251342773]]], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification_regression(self, test_transformer_token_classification, input, output, word_ids): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": 
False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "token_classification", + "task_kwargs": {"regression": True}, + "adapter_name": "" + }) + + prediction = await test_transformer_token_classification.predict(request) + assert prediction.model_outputs["logits"] == output + assert "labels" not in prediction.task_outputs + assert "logits" in prediction.model_outputs + assert prediction.task_outputs["word_ids"] == word_ids + + +@pytest.mark.usefixtures("test_transformer_embedding") +class TestTransformerSequenceClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,mode", [(["this is a test"], "mean"), + (["this is a test", "this is a test with a longer sentence"], "mean"), + (["this is a test"], "max"), + (["this is a test", "this is a test with a longer sentence"], "max"), + (["this is a test"], "cls"), + (["this is a test", "this is a test with a longer sentence"], "cls")], + ) + async def test_embedding(self, test_transformer_embedding, input, mode): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {"embedding_mode": mode}, + "adapter_name": "" + }) + + prediction = await test_transformer_embedding.predict(request) + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.task_outputs["embedding_mode"] == mode + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_embedding_token(self, test_transformer_embedding, input, word_ids): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {"embedding_mode": "token"}, + "adapter_name": "" + }) + + prediction = await test_transformer_embedding.predict(request) + assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.task_outputs["embedding_mode"] == "token" + assert prediction.task_outputs["word_ids"] == word_ids + + @pytest.mark.asyncio + async def test_embedding_unknown_mode(self, test_transformer_embedding): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {"embedding_mode": "this mode does not exist"}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_transformer_embedding.predict(request) + + +@pytest.mark.usefixtures("test_transformer_question_answering") +class TestTransformerSequenceClassification: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [([["What is a test?", "A test is a thing where you test."]]), + ([["What is a test?", "A test is a thing where you test."], + ["What is a longer test?", "A test is a thing where you test. 
If it is longer you call it longer"]])], + ) + async def test_question_answering(self, test_transformer_question_answering, input): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "question_answering", + "task_kwargs": {"topk": 1}, + "adapter_name": "" + }) + + prediction = await test_transformer_question_answering.predict(request) + answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.task_outputs["answers"]) == len(input) + assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) + + @pytest.mark.asyncio + async def test_question_answering_topk(self, test_transformer_question_answering): + input = [["What is a test?", "A test is a thing where you test."]] + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "question_answering", + "task_kwargs": {"topk": 2}, + "adapter_name": "" + }) + + prediction = await test_transformer_question_answering.predict(request) + answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.task_outputs["answers"]) == len(input) + assert all(prediction.task_outputs["answers"][0][i]["answer"] == answers[i] for i in range(2)) + + +@pytest.mark.usefixtures("test_transformer_generation") +class TestTransformerGeneration: + @pytest.mark.asyncio + async def test_generation(self, test_transformer_generation): + request = PredictionRequest.parse_obj({ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "generation", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_generation.predict(request) + assert prediction \ No newline at end of file diff --git a/square-model-inference-api/tests/__init__.py b/square-model-inference-api/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/square-model-inference-api/tests/conftest.py b/square-model-inference-api/tests/conftest.py deleted file mode 100644 index b21642754..000000000 --- a/square-model-inference-api/tests/conftest.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest -from starlette.config import environ -from starlette.testclient import TestClient - - -environ["API_KEY"] = "example_key" - - -from square_model_inference.main import get_app # noqa: E402 - - -@pytest.fixture() -def test_client(): - app = get_app() - with TestClient(app) as test_client: - yield test_client diff --git a/square-model-inference-api/tests/test_api/test_api_auth.py b/square-model-inference-api/tests/test_api/test_api_auth.py deleted file mode 100644 index cc5467b20..000000000 --- a/square-model-inference-api/tests/test_api/test_api_auth.py +++ /dev/null @@ -1,17 +0,0 @@ -from square_model_inference.core import messages - - -def test_auth_using_prediction_api_no_apikey_header(test_client) -> None: - response = test_client.post("/api/v1/question") - assert response.status_code == 400 - assert response.json() == {"detail": messages.NO_API_KEY} - - -def 
test_auth_using_prediction_api_wrong_apikey_header(test_client) -> None: - response = test_client.post( - "/api/v1/question", - json={"context": "test", "question": "test"}, - headers={"token": "WRONG_TOKEN"}, - ) - assert response.status_code == 401 - assert response.json() == {"detail": messages.AUTH_REQ} diff --git a/square-model-inference-api/tests/test_api/test_prediction.py b/square-model-inference-api/tests/test_api/test_prediction.py deleted file mode 100644 index a00caf028..000000000 --- a/square-model-inference-api/tests/test_api/test_prediction.py +++ /dev/null @@ -1,23 +0,0 @@ -def test_prediction(test_client) -> None: - response = test_client.post( - "/api/v1/question", - json={"context": "two plus two equal four", "question": "What is four?"}, - headers={"token": "example_key"}, - ) - assert response.status_code == 200 - assert "score" in response.json() - - -def test_prediction_nopayload(test_client) -> None: - response = test_client.post( - "/api/v1/question", json={}, headers={"token": "example_key"} - ) - """ - ## if nopayload, default Hitchhiker's Guide to the Galaxy example is sent - # context:"42 is the answer to life, the universe and everything." - # question:"What is the answer to life?" - # if no default, assert response.status_code == 422 - """ - - data = response.json() - assert data["answer"] == "42" diff --git a/square-model-inference-api/tests/test_service/test_models.py b/square-model-inference-api/tests/test_service/test_models.py deleted file mode 100644 index 61d666db7..000000000 --- a/square-model-inference-api/tests/test_service/test_models.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -from square_model_inference.core import config -from square_model_inference.models.payload import QAPredictionPayload -from square_model_inference.models.prediction import QAPredictionResult -from square_model_inference.transformers.nlp import QAModel - - -def test_prediction(test_client) -> None: - model_path = config.DEFAULT_MODEL_PATH - qa = QAPredictionPayload.parse_obj( - {"context": "two plus two equal four", "question": "What is four?"} - ) - - tm = QAModel(model_path) - result = tm.predict(qa) - assert isinstance(result, QAPredictionResult) - assert result.answer == "two plus two" - - -def test_prediction_no_payload(test_client) -> None: - model_path = config.DEFAULT_MODEL_PATH - - tm = QAModel(model_path) - with pytest.raises(ValueError): - result = tm.predict(None) - assert isinstance(result, QAPredictionResult) From 074fc3bc1b8f6510f1ff6cd441a3d7b58d8e6dd2 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 17 Jun 2021 14:29:51 +0200 Subject: [PATCH 04/23] Added generation for transformers; continued tests (missing for now just adapters) --- .../inference_server/Dockerfile | 6 +- .../inference/sentencetransformer.py | 6 +- .../inference/transformer.py | 34 ++++- .../models/prediction.py | 20 ++- .../inference_server/tests/conftest.py | 7 +- .../pre_test_setup_for_docker_caching.py | 2 +- .../tests/test_inference/test_adapter.py | 10 ++ .../test_sentence_transformer.py | 54 +++++++ .../tests/test_inference/test_transformers.py | 135 +++++++++++------- 9 files changed, 198 insertions(+), 76 deletions(-) create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_adapter.py create mode 100644 square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index dee4290fb..fe2b5344c 
100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -9,8 +9,8 @@ COPY requirements.txt ./ RUN pip install --upgrade pip && \ pip install -r requirements.txt -COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_docker_caching.py -RUN python ./tests/pre_test_setup_for_docker_caching.py +COPY ./inference_server/tests/pre_test_setup_for_docker_caching.py ./inference_server/tests/pre_test_setup_for_docker_caching.py +RUN PYTHONPATH=./inference_server; python ./inference_server/tests/pre_test_setup_for_docker_caching.py COPY . ./ @@ -18,7 +18,7 @@ FROM base as test WORKDIR /app RUN pip install pytest pytest-cov pytest-asyncio COPY ../tests tests -RUN pytest --cov +RUN PYTHONPATH=./inference_server; pytest --cov FROM base as build diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index f6acf7672..7ed77b813 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -4,7 +4,7 @@ import torch from loguru import logger import numpy as np -from sentence_transformers import SentenceTransformer +from sentence_transformers import SentenceTransformer as SentenceTransformerModel from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task @@ -26,7 +26,7 @@ def _load_model(self, model_name, disable_gpu): """ logger.debug(f"Loading model {model_name}") device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" - model = SentenceTransformer(model_name_or_path=model_name, device=device) + model = SentenceTransformerModel(model_name_or_path=model_name, device=device) logger.info(f"Model {model_name} loaded on {device}") self.model = model @@ -40,6 +40,6 @@ async def predict(self, request: PredictionRequest) -> PredictionOutput: raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") if request.task != Task.embedding: - raise NotImplementedError("Only embedding task supported by this model") + raise ValueError("Only embedding task supported by this model") return self._embedding(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 32c34cc59..a6994ec4d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -1,4 +1,5 @@ import json +from collections import defaultdict from typing import Union, Tuple import torch @@ -130,7 +131,7 @@ def _token_classification(self, request: PredictionRequest) -> PredictionOutput: "id2label": id2label, "word_ids": [features.word_ids(i) for i in range(len(request.input))] } - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("is_regression", False): probabilities = torch.softmax(predictions["logits"], dim=-1) predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() @@ -146,7 +147,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp } # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax - if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("regression", False): + if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("is_regression", False): probabilities = torch.softmax(predictions["logits"], dim=-1) predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() @@ -154,7 +155,34 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) def _generation(self, request: PredictionRequest) -> PredictionOutput: - raise NotImplementedError("Generation is currently not implemented") + request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", False) + request.preprocessing_kwargs["add_special_tokens"] = request.preprocessing_kwargs.get("add_special_tokens", False) + task_outputs = {"generated_texts": []} + model_outputs = defaultdict(list) + for prompt in request.input: + features = self.tokenizer(prompt, return_tensors="pt", **request.preprocessing_kwargs) + input_ids = features["input_ids"] + input_ids = self._ensure_tensor_on_device(input_ids=input_ids)["input_ids"] + request.model_kwargs.update(request.task_kwargs) + request.model_kwargs["return_dict_in_generate"] = True + res = self.model.generate(input_ids, **request.model_kwargs) + + # put everything on CPU and add it to model_outputs + for key in res.keys(): + if isinstance(res[key], tuple): + if isinstance(res[key][0], tuple): + res[key] = tuple((tuple(tensor.cpu() for tensor in tpl)) for tpl in res[key]) + else: + res[key] = tuple(tensor.cpu() for tensor in res[key]) + else: + res[key] = res[key].cpu() + model_outputs[key].append(res[key]) + + generated_texts = [self.tokenizer.decode(seq, skip_special_tokens=True, + clean_up_tokenization_spaces=request.task_kwargs.get("clean_up_tokenization_spaces", False)) + for seq in 
res["sequences"]] + task_outputs["generated_texts"].append(generated_texts) + return PredictionOutput(model_outputs=model_outputs, task_outputs=task_outputs) def _question_answering(self, request: PredictionRequest) -> PredictionOutput: # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 333b150e7..f3f6df4cd 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,3 +1,4 @@ +from collections import Iterable from typing import Dict, Union, Tuple import torch @@ -15,7 +16,10 @@ def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]], retu :param obj: the objects whose tensors will be encoded :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. """ + # Encode numpy array either as lists or base64 string def encode(arr): + if isinstance(arr, torch.Tensor): + arr = arr.numpy() if return_plaintext: return arr.tolist() else: @@ -30,13 +34,17 @@ def encode(arr): # arr_binary_b64 = arr_string_b64.encode() # arr_binary = base64.decodebytes(arr_binary_b64) # arr = np.load(BytesIO(arr_binary)) + + # Recursively go through a value and encode leafs (=tensors) it or iterate over values and encode them + def enc_or_iterate(val): + if isinstance(val, Iterable) and not isinstance(val, torch.Tensor) and not isinstance(val, np.ndarray): + return [enc_or_iterate(v) for v in val] + else: + return encode(val) + for k, v in obj.items(): - if isinstance(v, torch.Tensor): - v = v.numpy() - obj[k] = encode(v) - elif isinstance(v, tuple) and isinstance(v[0], torch.Tensor): - v = [encode(vv.numpy()) for vv in v] - obj[k] = v + v = enc_or_iterate(v) + obj[k] = v return obj diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 8a2f333f8..55a6e05e2 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -23,6 +23,7 @@ from square_model_inference.inference.adaptertransformer import AdapterTransformer from square_model_inference.inference.sentencetransformer import SentenceTransformer + @pytest.fixture(scope="session") def test_app(): app = get_app() @@ -30,12 +31,6 @@ def test_app(): return app -@pytest.fixture() -def test_client(test_app): - with TestClient(test_app) as test_client: - yield test_client - - class TestModel(Model): def __init__(self): self.prediction = PredictionOutput(model_outputs={}, task_outputs={}) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index fe6794e6b..cf12f1e0f 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -16,7 +16,7 @@ #Downloaded models: TRANSFORMER_MODEL = "bert-base-uncased" -SENTENCE_MODEL = "paraphrase-MiniLM-L6-v2 " +SENTENCE_MODEL = "paraphrase-albert-small-v2" if __name__ == "__main__": bert_transformer = Transformer(TRANSFORMER_MODEL, 
"base", 1, True) diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py new file mode 100644 index 000000000..585801453 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py @@ -0,0 +1,10 @@ +import pytest + +from square_model_inference.models.request import PredictionRequest +import numpy as np + + +def test_adapter(): + pytest.fail("Adapter tests waiting for next version of adapter-transformers. " + "Not enough heads available. " + "Waiting for https://github.com/Adapter-Hub/adapter-transformers/commit/94ff58dbf09ac1d14b7107c9ce48770d2d352161") \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py new file mode 100644 index 000000000..a290a78c6 --- /dev/null +++ b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py @@ -0,0 +1,54 @@ +import pytest + +import numpy as np +from square_model_inference.models.request import PredictionRequest + + +@pytest.mark.usefixtures("test_sentence_transformer") +class TestSentenceTransformerEmbedding: + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])]) + async def test_embedding(self, test_sentence_transformer, input): + request = PredictionRequest.parse_obj({ + "input": input, + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_sentence_transformer.predict(request) + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + + @pytest.mark.asyncio + async def test_not_embedding(self, test_sentence_transformer): + request = PredictionRequest.parse_obj({ + "input": ["input"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_sentence_transformer.predict(request) + + @pytest.mark.asyncio + async def test_forbid_is_preprocessed(self, test_sentence_transformer): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": True, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_sentence_transformer.predict(request) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index a8631c0ef..9c1a9b00d 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -5,12 +5,11 @@ @pytest.mark.usefixtures("test_transformer_sequence_classification") class TestTransformerSequenceClassification: - @pytest.mark.asyncio - @pytest.mark.parametrize("input,output", [(["this is a test"], [1]), - (["this is a test", "this is a test with a longer sentence"], [1, 1])], + 
@pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification(self, test_transformer_sequence_classification, input, output): + async def test_sequence_classification(self, test_transformer_sequence_classification, input): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, @@ -22,8 +21,9 @@ async def test_sequence_classification(self, test_transformer_sequence_classific }) prediction = await test_transformer_sequence_classification.predict(request) - np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are not converted to softmax") - assert prediction.task_outputs["labels"] == output + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") + assert len(prediction.task_outputs["labels"]) == len(input) + assert all(isinstance(prediction.task_outputs["labels"][i], int) for i in range(len(input))) assert "logits" in prediction.model_outputs assert "id2label" in prediction.task_outputs @@ -43,23 +43,22 @@ async def test_sequence_classification_output_attention(self, test_transformer_s assert "attentions" in prediction.model_outputs @pytest.mark.asyncio - @pytest.mark.parametrize("input,output", [(["this is a test"], [[0.17748619616031647, 0.4802907109260559]]), - (["this is a test", "this is a test with a longer sentence"], - [[0.17748622596263885, 0.48029083013534546], [-0.048022523522377014, 0.5764690637588501]])], + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification_regression(self, test_transformer_sequence_classification, input, output): + async def test_sequence_classification_regression(self, test_transformer_sequence_classification, input): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, "task": "sequence_classification", - "task_kwargs": {"regression": True}, + "task_kwargs": {"is_regression": True}, "adapter_name": "" }) prediction = await test_transformer_sequence_classification.predict(request) - assert prediction.model_outputs["logits"] == output + assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -68,12 +67,11 @@ async def test_sequence_classification_regression(self, test_transformer_sequenc class TestTransformerTokenClassification: @pytest.mark.asyncio - @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[0, 1, 1, 1, 1, 1]], [[None, 0, 1, 2, 3, None]]), + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), (["this is a test", "this is a test with a longer sentence"], - [[0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification(self, test_transformer_token_classification, input, output, word_ids): + async def test_token_classification(self, test_transformer_token_classification, input, word_ids): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, @@ -85,65 
+83,39 @@ async def test_token_classification(self, test_transformer_token_classification, }) prediction = await test_transformer_token_classification.predict(request) - np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(output), err_msg="logits are not converted to softmax") - assert prediction.task_outputs["labels"] == output + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") + assert all(len(prediction.task_outputs["labels"][i]) == len(word_ids[i]) for i in range(len(input))) assert "logits" in prediction.model_outputs assert "id2label" in prediction.task_outputs assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.asyncio - @pytest.mark.parametrize("input,output,word_ids", [(["this is a test"], [[[0.11221601068973541, 0.05969493091106415], - [-0.15642595291137695, -0.07739989459514618], - [-0.2801300287246704, 0.06872765719890594], - [-0.28268659114837646, -0.01801353693008423], - [0.032697953283786774, 0.051608189940452576], - [0.01579795777797699, 0.23798659443855286]]], + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), (["this is a test", "this is a test with a longer sentence"], - [[[0.11221593618392944, 0.059694863855838776], - [-0.15642589330673218, -0.07740016281604767], - [-0.2801305651664734, 0.06872743368148804], - [-0.28268682956695557, -0.018013179302215576], - [0.03269825875759125, 0.051608070731163025], - [0.01579788327217102, 0.2379867136478424], - [0.03173816204071045, -0.15285855531692505], - [0.11003853380680084, -0.12873490154743195], - [0.13318589329719543, -0.10646772384643555], - [0.13220593333244324, -0.12443935126066208]], - [[-0.05789753794670105, 0.04508095979690552], - [-0.27872341871261597, -0.1229611188173294], - [-0.5753417015075684, 0.07893967628479004], - [-0.47106701135635376, -0.04298338294029236], - [-0.5324697494506836, 0.10730768740177155], - [-0.31086403131484985, 0.3265591263771057], - [-0.2457294911146164, 0.24867495894432068], - [-0.2389427125453949, 0.4464331567287445], - [-0.08393016457557678, -0.03680300712585449], - [-0.01722254604101181, 0.41447973251342773]]], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification_regression(self, test_transformer_token_classification, input, output, word_ids): + async def test_token_classification_regression(self, test_transformer_token_classification, input, word_ids): request = PredictionRequest.parse_obj({ "input": input, "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, "task": "token_classification", - "task_kwargs": {"regression": True}, + "task_kwargs": {"is_regression": True}, "adapter_name": "" }) prediction = await test_transformer_token_classification.predict(request) - assert prediction.model_outputs["logits"] == output + assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.usefixtures("test_transformer_embedding") -class TestTransformerSequenceClassification: - +class TestTransformerEmbedding: @pytest.mark.asyncio @pytest.mark.parametrize("input,mode", [(["this is a test"], "mean"), (["this is a test", "this is a test with a longer 
sentence"], "mean"), @@ -207,9 +179,22 @@ async def test_embedding_unknown_mode(self, test_transformer_embedding): with pytest.raises(ValueError): prediction = await test_transformer_embedding.predict(request) + @pytest.mark.asyncio + async def test_forbid_is_preprocessed(self, test_transformer_embedding): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": True, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "embedding", + "task_kwargs": {}, + "adapter_name": "" + }) + with pytest.raises(ValueError): + prediction = await test_transformer_embedding.predict(request) @pytest.mark.usefixtures("test_transformer_question_answering") -class TestTransformerSequenceClassification: +class TestTransformerQuestionAnswering: @pytest.mark.asyncio @pytest.mark.parametrize("input", [([["What is a test?", "A test is a thing where you test."]]), @@ -256,11 +241,12 @@ async def test_question_answering_topk(self, test_transformer_question_answering @pytest.mark.usefixtures("test_transformer_generation") class TestTransformerGeneration: @pytest.mark.asyncio - async def test_generation(self, test_transformer_generation): + @pytest.mark.parametrize("input", [(["Generate text"]), + (["Generate text", "And more text"])], + ) + async def test_generation(self, test_transformer_generation, input): request = PredictionRequest.parse_obj({ - "input": [ - "this is a test" - ], + "input": input, "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, @@ -270,4 +256,45 @@ async def test_generation(self, test_transformer_generation): }) prediction = await test_transformer_generation.predict(request) - assert prediction \ No newline at end of file + assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) + + @pytest.mark.asyncio + async def test_generation_output_attention_and_scores(self, test_transformer_generation): + request = PredictionRequest.parse_obj({ + "input": ["Generate text"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": { + "output_attentions": True, + "output_scores": True + }, + "task": "generation", + "task_kwargs": {}, + "adapter_name": "" + }) + + prediction = await test_transformer_generation.predict(request) + assert "scores" in prediction.model_outputs + assert "attentions" in prediction.model_outputs + + @pytest.mark.asyncio + async def test_generation_beam_sample_multiple_seqs(self, test_transformer_generation): + request = PredictionRequest.parse_obj({ + "input": ["Generate text"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "generation", + "task_kwargs": { + "num_beams": 2, + "do_sample": True, + "top_k": 10, + "top_p": 0.5, + "no_repeat_ngram_size": 2, + "num_return_sequences": 2 + }, + "adapter_name": "" + }) + + prediction = await test_transformer_generation.predict(request) + assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file From eee585178c60b07e5b09bc921ac451d806590d74 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 17 Jun 2021 17:12:24 +0200 Subject: [PATCH 05/23] Extended documentation --- .../api/routes/prediction.py | 2 + .../square_model_inference/core/config.py | 17 +++++--- .../core/event_handlers.py | 4 +- .../inference/adaptertransformer.py | 19 +++++++-- .../inference/sentencetransformer.py | 17 ++++++-- .../inference/transformer.py | 40 ++++++++++++++---- .../models/prediction.py | 17 ++++++-- .../square_model_inference/models/request.py | 42 
+++++++++++++++---- .../inference_server/tests/conftest.py | 2 +- .../pre_test_setup_for_docker_caching.py | 2 +- 10 files changed, 125 insertions(+), 37 deletions(-) diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index 5c2f6ec25..077a58b99 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -1,5 +1,6 @@ from fastapi import APIRouter from starlette.requests import Request +from loguru import logger from square_model_inference.models.request import PredictionRequest from square_model_inference.models.prediction import PredictionOutput @@ -14,6 +15,7 @@ async def predict( prediction_request: PredictionRequest = None, ) -> PredictionOutput: + logger.info(f"Request: {request.dict()}") model: Model = request.app.state.model prediction: PredictionOutput = await model.predict(prediction_request) diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index a3973bdee..c1cd0c6a3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -8,17 +8,24 @@ # Disable CUDA even if available DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) -# Corresponds to the Huggingface name for Transformers +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers MODEL_NAME: str = config("MODEL_NAME") # Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model MODEL_TYPE: str = config("MODEL_TYPE") # Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") -MAX_BATCH_SIZE: int = config("MAX_BATCH_SIZE", cast=int, default=32) +# Batch size used for many inputs +BATCH_SIZE: int = config("BATCH_SIZE", cast=int, default=32) -# For MODEL_TYPE=transformers: decides the AutoModelFor* class used +# For MODEL_TYPE=transformers: decides the AutoModel* class used +# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class MODEL_CLASS: str = config("MODEL_CLASS", default="base") -# Flag that decides if returned numpy arrays are returned as lists or encoded to base64 (smaller but not easily human readable) -RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) \ No newline at end of file +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). 
+# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS = config("RETURN_PLAINTEXT_ARRAYS", cast=bool, default=False) diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index bec998460..5a629fa2e 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -4,7 +4,7 @@ from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, MAX_BATCH_SIZE, TRANSFORMERS_CACHE +from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, TRANSFORMERS_CACHE from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer @@ -18,7 +18,7 @@ "model_name": MODEL_NAME, "model_class": MODEL_CLASS, "disable_gpu": DISABLE_GPU, - "max_batch_size": MAX_BATCH_SIZE, + "batch_size": BATCH_SIZE, "transformers_cache": TRANSFORMERS_CACHE } diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index e76259aa3..6e8ef5aa3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -1,6 +1,5 @@ import json -import torch from loguru import logger from transformers import AutoModelWithHeads @@ -13,10 +12,22 @@ class AdapterTransformer(Transformer): - def __init__(self, model_name, max_batch_size, disable_gpu, transformers_cache, **kwargs): + """ + The class for all adapter-based models using the adapter-transformers package + """ + def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, **kwargs): + """ + Initialize the Adapter with its underlying Transformer and pre-load all available adapters from adapterhub.ml + :param model_name: the Huggingface model name + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param transformers_cache: Should be same as TRANSFORMERS_CACHE env variable. + This folder will be used to store the adapters + :param kwargs: Not used + """ self._load_model(AutoModelWithHeads, model_name, disable_gpu) self._load_adapter(model_name, transformers_cache) - self.max_batch_size = max_batch_size + self.batch_size = batch_size def _load_adapter(self, model_name, transformers_cache): """ @@ -24,6 +35,7 @@ def _load_adapter(self, model_name, transformers_cache): We parse the hub index to extract all names and then load each model. 
""" logger.info("Loading all available adapters from adapterhub.ml") + logger.warning("This will not load adapters published only on https://huggingface.co/models") index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) adapter_index = json.load(open(index_file)) adapters = set() @@ -69,7 +81,6 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp async def predict(self, request: PredictionRequest) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") - if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: raise ValueError(f"Unknown or missing adapter {request.adapter_name}. " f"Please provider a fully specified adapter name from adapterhub.ml") diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index 7ed77b813..6244a0222 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -13,11 +13,20 @@ class SentenceTransformer(Model): - SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + """ + The class for all sentence-transformers models + """ - def __init__(self, model_name, max_batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, batch_size, disable_gpu, **kwargs): + """ + Initialize the SentenceTransformer + :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param kwargs: Not used + """ self._load_model(model_name, disable_gpu) - self.max_batch_size = max_batch_size + self.batch_size = batch_size def _load_model(self, model_name, disable_gpu): """ @@ -31,7 +40,7 @@ def _load_model(self, model_name, disable_gpu): self.model = model def _embedding(self, request: PredictionRequest) -> PredictionOutput: - embeddings = self.model.encode(request.input, batch_size=self.max_batch_size, show_progress_bar=False) + embeddings = self.model.encode(request.input, batch_size=self.batch_size, show_progress_bar=False) return PredictionOutput(model_outputs={"embeddings": embeddings}, task_outputs={}) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index a6994ec4d..5e4dc9a15 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -22,13 +22,24 @@ } class Transformer(Model): + """ + The class for all Huggingface transformer-based models + """ SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] - def __init__(self, model_name, model_class, max_batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, model_class, batch_size, disable_gpu, **kwargs): + """ + Initialize the Transformer + :param model_name: the Huggingface model name + :param model_class: the class name (according to CLASS_MAPPING) to use + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to 
GPU even if CUDA is available + :param kwargs: Not used + """ if model_class not in CLASS_MAPPING: raise RuntimeError(f"Unknown MODEL_CLASS. Must be one of {CLASS_MAPPING.keys()}") self._load_model(CLASS_MAPPING[model_class], model_name, disable_gpu) - self.max_batch_size = max_batch_size + self.batch_size = batch_size def _load_model(self, model_cls, model_name, disable_gpu): """ @@ -59,15 +70,22 @@ def _ensure_tensor_on_device(self, **inputs): def _predict(self, request: PredictionRequest, output_features=False) \ -> Union[dict, Tuple[dict, dict]]: + """ + Inference on the input. + :param request: the request with the input and optional kwargs + :param output_features: return the features of the input. + Necessary if, e.g., attention mask is needed for post-processing. + :return: The model outputs and optionally the input features + """ all_predictions = [] request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", True) request.preprocessing_kwargs["truncation"] = request.preprocessing_kwargs.get("truncation", True) features = self.tokenizer(request.input, return_tensors="pt", **request.preprocessing_kwargs) - for start_idx in range(0, len(request.input), self.max_batch_size): + for start_idx in range(0, len(request.input), self.batch_size): with torch.no_grad(): - input_features = {k: features[k][start_idx:start_idx+self.max_batch_size] for k in features.keys()} + input_features = {k: features[k][start_idx:start_idx+self.batch_size] for k in features.keys()} input_features = self._ensure_tensor_on_device(**input_features) predictions = self.model(**input_features, **request.model_kwargs) all_predictions.append(predictions) @@ -123,7 +141,7 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: def _token_classification(self, request: PredictionRequest) -> PredictionOutput: predictions, features = self._predict(request, output_features=True) - # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # If logits dim > 1 or if the 'is_regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax label2id = self.model.config.label2id id2label = {v:k for k,v in label2id.items()} @@ -145,7 +163,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp task_outputs = { "id2label": id2label } - # If logits dim > 1 or if the 'regression' flag is not set, we assume classification: + # If logits dim > 1 or if the 'is_regression' flag is not set, we assume classification: # We replace the logits by the softmax and add labels chosen with argmax if predictions["logits"].size()[-1] != 1 and not request.task_kwargs.get("is_regression", False): probabilities = torch.softmax(predictions["logits"], dim=-1) @@ -159,6 +177,7 @@ def _generation(self, request: PredictionRequest) -> PredictionOutput: request.preprocessing_kwargs["add_special_tokens"] = request.preprocessing_kwargs.get("add_special_tokens", False) task_outputs = {"generated_texts": []} model_outputs = defaultdict(list) + # We cannot batch generate so we have to to it separately for each input prompt. 
for prompt in request.input: features = self.tokenizer(prompt, return_tensors="pt", **request.preprocessing_kwargs) input_ids = features["input_ids"] @@ -185,6 +204,13 @@ def _generation(self, request: PredictionRequest) -> PredictionOutput: return PredictionOutput(model_outputs=model_outputs, task_outputs=task_outputs) def _question_answering(self, request: PredictionRequest) -> PredictionOutput: + """ + Span-based question answering for a given question and context. + + We expect the input to use the (question, context) format for the text pairs. + :param request: + :return: + """ # Making heavy use of https://huggingface.co/transformers/_modules/transformers/pipelines/question_answering.html#QuestionAnsweringPipeline def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, undesired_tokens_: np.ndarray) -> Tuple: """ @@ -254,7 +280,7 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True))) starts, ends, scores = decode( - start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 512), undesired_tokens + start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 128), undesired_tokens ) enc = features[idx] answers = [ diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index f3f6df4cd..98fea524a 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -11,8 +11,8 @@ def _encode_numpy(obj: Dict[str, Union[torch.Tensor, Tuple[torch.Tensor]]], return_plaintext: bool=RETURN_PLAINTEXT_ARRAYS) -> Dict[str, Union[list, str]]: """ - Encodes the Torch Tensors first to Numpy arrays and then makes either plain lists or base64-encode them depending - on the flag RETURN_PLAINTEXT_ARRAYS + Encodes the Torch Tensors first to Numpy arrays and then encodes them either as plain lists or base64 string + depending on the flag RETURN_PLAINTEXT_ARRAYS :param obj: the objects whose tensors will be encoded :return: the same dictionary with all tensors replaced by lists or base64-encoded array strings. """ @@ -30,6 +30,7 @@ def encode(arr): arr_binary_b64 = base64.b64encode(arr_binary) arr_string_b64 = arr_binary_b64.decode("latin1") return arr_string_b64 + # DECODE THE VALUE WITH # arr_binary_b64 = arr_string_b64.encode() # arr_binary = base64.decodebytes(arr_binary_b64) @@ -53,11 +54,19 @@ class PredictionOutput(BaseModel): The results of the prediction of the model on the given input for the requested task. """ model_outputs: dict = Field( - description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array." + description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array.

" + "Decode the base64 string 'arr_string_b64' back to an array in Python like this:
" + "arr_binary_b64 = arr_string_b64.encode()
" + "arr_binary = base64.decodebytes(arr_binary_b64)
" + "arr = np.load(BytesIO(arr_binary))

" + "Task 'generation' does not concatenate the tensors of the inputs together but instead creates a list" + "with each entry corresponding to the respective input." + ) task_outputs: dict = Field( description="Dictionary containing the task outputs. This covers anything from generated text, " - "labels, id2label mappings, token2word mappings, etc." + "labels, id2label mappings, token2word mappings, etc.

" + "SentenceTransformer: This is not used.
" ) def __init__(self, **data): diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index ab3c7e1ea..ad4af8ae6 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -25,31 +25,55 @@ class PredictionRequest(BaseModel): ..., description="Input for the model. Supports Huggingface Transformer inputs (i.e., list of sentences, or " "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " - "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True." + "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True. " + "

" + "Transformer/ Adapter:
" + "Task 'question_answering' expects the input to be in the (question, context) format." ) is_preprocessed: bool = Field( default=False, description="Flag indicating that the input contains already pre-processed numpy arrays " - "as list and that it needs no further pre-processing." + "as list and that it needs no further pre-processing.

" + "Transformer/ Adapter/ SentenceTransformer: 'is_preprocessed' is not supported." ) preprocessing_kwargs: dict = Field( default={}, - description="Dictionary containing additional parameters for the pre-processing step." + description="Optional dictionary containing additional parameters for the pre-processing step.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the Huggingface tokenizer for possible parameters." ) model_kwargs: dict = Field( default={}, - description="Dictionary containing parameters that are passed to the model for the forward pass. " - "Set ‘output_attentions=True’ to receive the attention weights for Huggingface Transformers in the " - "output." + description="Optional dictionary containing parameters that are passed to the model for the forward pass " + "to control what additional tensors are returned.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" + "For example, set ‘output_attentions=True’ to receive the attention results in the output." ) task: Task = Field(...) task_kwargs: dict = Field( default={}, - description="Dictionary containing additional parameters for handling of the task and " - "task-related post-processing." + description="Optional dictionary containing additional parameters for handling of the task and " + "task-related post-processing.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter:
" + "'sentence_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'token_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'embedding':
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "'question_answering':
" + "- 'topk': Return the top-k most likely spans. Default 1.
" + "- 'max_answer_len': Maximal token length of answers. Default 128.
" + "'generation':
" + "- 'clean_up_tokenization_spaces': See parameter in Huggingface tokenizer.decode(). Default False
" + "- See Huggingface model.generate() for all possible parameters that can be used. " + "Note, 'model_kwargs' and 'task_kwargs' are merged for generation." + ) adapter_name: Optional[str] = Field( default="", - description="Only necessary for adapter-based models. " + description="Only necessary for Adapter. " "The fully specified name of the to-be-used adapter from adapterhub.ml" ) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 55a6e05e2..df5906e2f 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -12,7 +12,7 @@ # environ["MODEL_NAME"] = "test" # environ["MODEL_TYPE"] = "test" # environ["DISABLE_GPU"] = "True" -# environ["MAX_BATCH_SIZE"] = "1" +# environ["BATCH_SIZE"] = "1" # environ["RETURN_PLAINTEXT_ARRAYS"] = "False" from main import get_app diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index cf12f1e0f..0b8a3fd41 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -7,7 +7,7 @@ environ["MODEL_NAME"] = "test" environ["MODEL_TYPE"] = "test" environ["DISABLE_GPU"] = "True" -environ["MAX_BATCH_SIZE"] = "1" +environ["BATCH_SIZE"] = "1" environ["RETURN_PLAINTEXT_ARRAYS"] = "True" from square_model_inference.inference.transformer import Transformer From 0029da1eb5370af5506197bdb68f91f7638c1817 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 24 Jun 2021 15:18:11 +0200 Subject: [PATCH 06/23] Updated prediction documentation; question-answering: added 'no answer' support --- .../api/routes/prediction.py | 2 +- .../inference/adaptertransformer.py | 1 + .../inference/transformer.py | 11 ++++- .../models/prediction.py | 46 ++++++++++++++++++- 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index 077a58b99..6632c3bf4 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -15,7 +15,7 @@ async def predict( prediction_request: PredictionRequest = None, ) -> PredictionOutput: - logger.info(f"Request: {request.dict()}") + logger.info(f"Request: {prediction_request.dict()}") model: Model = request.app.state.model prediction: PredictionOutput = await model.predict(prediction_request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 6e8ef5aa3..3a0507219 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -36,6 +36,7 @@ def _load_adapter(self, model_name, transformers_cache): """ logger.info("Loading all available adapters from adapterhub.ml") logger.warning("This will not load adapters published only on 
https://huggingface.co/models") + logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) adapter_index = json.load(open(index_file)) adapters = set() diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 5e4dc9a15..46c68c77a 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -267,7 +267,10 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, start = start.numpy() end = end.numpy() # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. - undesired_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1) & features["attention_mask"][idx].numpy() + question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1) + # Unmask CLS token for 'no answer' + question_tokens[0] = 1 + undesired_tokens = question_tokens & features["attention_mask"][idx].numpy() # Generate mask undesired_tokens_mask = undesired_tokens == 0.0 @@ -279,6 +282,10 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True))) end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True))) + # Get score for 'no answer' then mask for decoding step (CLS token + no_answer_score = (start[0] * end[0]).item() + start[0] = end[0] = 0.0 + starts, ends, scores = decode( start, end, request.task_kwargs.get("topk", 1), request.task_kwargs.get("max_answer_len", 128), undesired_tokens ) @@ -294,6 +301,8 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]], } for s, e, score in zip(starts, ends, scores)] + answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""}) + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: request.task_kwargs.get("topk", 1)] task_outputs["answers"].append(answers) return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 98fea524a..e1160a1da 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -59,15 +59,59 @@ class PredictionOutput(BaseModel): "arr_binary_b64 = arr_string_b64.encode()
" "arr_binary = base64.decodebytes(arr_binary_b64)
" "arr = np.load(BytesIO(arr_binary))

" + "SentenceTransformer:
" + "'embedding':
" + "- 'embeddings: Embedding tensors.
" + "Transformer/ Adapter:
" + "Optional tensor depend on request's 'model_kwargs' parameters, e.g. 'output_attentions'. " + "See the Huggingface documentation for information like shape etc.
" + + "'sentence_classification':
" + "- 'logits': (Softmax) logits of the classifier.
" + "'token_classification':
" + "- 'logits': (Softmax) logits of the classifier.
" + "'embedding':
" + "- 'embeddings: Embedding tensors.
" + "'question_answering':
" + "- 'start_logits': Logits for the beginning of the span
" + "- 'end_logits': Logits for the end of the span
" + "'generation':
" + "- 'sequences': The generated vocab ids for the sequence
" "Task 'generation' does not concatenate the tensors of the inputs together but instead creates a list" - "with each entry corresponding to the respective input." + "of the tensors for each input." ) task_outputs: dict = Field( description="Dictionary containing the task outputs. This covers anything from generated text, " "labels, id2label mappings, token2word mappings, etc.

" "SentenceTransformer: This is not used.
" + "Transformer/ Adapter:
" + "'sentence_classification':
" + "- 'labels': List of the predicted label ids for the input
" + "- 'id2label': Mapping from label id to the label name
" + "'token_classification':
" + "- 'labels': List of the predicted label ids for the input
" + "- 'id2label': Mapping from label id to the label name
" + "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input.
" + "'None' represents special tokens that have been added by the tokenizer
" + "'embedding':
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')
" + "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input. " + "Only returned if 'embedding_mode=token'
" + "'question_answering':
" + "- 'answers': List of lists of answers. Length of outer list is the number of inputs, " + "length of inner list is parameter 'topk' from the request's 'task_kwargs' (default 1). " + "Each answer is a dictionary with 'score', 'start' (span start index in context), 'end' (span end index in context), " + "and 'answer' (the extracted span). The inner list is sorted by score. If no answer span was extracted, " + "the empty span is returned (start and end both 0).
" + "'generation':
" + "- 'generated_texts': List of list of the generated texts. Length of outer list is the number of inputs, " + "length of inner list is parameter 'num_return_sequences' in request's 'task_kwargs'" ) + model_output_is_encoded: bool = Field( + not RETURN_PLAINTEXT_ARRAYS, + description="Flag indicating that 'model_output' is a base64-encoded numpy array and not a human-readable list." + "See the field description for 'model_output' on information on how to decode the array.") def __init__(self, **data): """ From 32c973a7cfb2557b60d136fa08e6bc71041652f9 Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 29 Jun 2021 09:23:20 +0200 Subject: [PATCH 07/23] Removed project dependencies from pre-test setup so it can run without the project code (Docker caching reasons) --- .../pre_test_setup_for_docker_caching.py | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index 0b8a3fd41..a6b537d5f 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -1,8 +1,15 @@ # This file is used for the Docker image to cache long-running setup for the tests, # i.e., downloading finetuned Transformer models and so on. # This way, adding new tests or even changing the server code does NOT trigger a new download during building +import json + +from sentence_transformers import SentenceTransformer from starlette.config import environ -TRANSFORMERS_TESTING_CACHE = "./model_testing_cache" +from transformers import AutoTokenizer, AutoModelWithHeads +from loguru import logger +from transformers.adapter_utils import download_cached, ADAPTER_HUB_INDEX_FILE + +TRANSFORMERS_TESTING_CACHE = "./.model_testing_cache" environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE environ["MODEL_NAME"] = "test" environ["MODEL_TYPE"] = "test" @@ -10,15 +17,34 @@ environ["BATCH_SIZE"] = "1" environ["RETURN_PLAINTEXT_ARRAYS"] = "True" -from square_model_inference.inference.transformer import Transformer -from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.inference.sentencetransformer import SentenceTransformer - -#Downloaded models: +# Downloaded models: TRANSFORMER_MODEL = "bert-base-uncased" SENTENCE_MODEL = "paraphrase-albert-small-v2" if __name__ == "__main__": - bert_transformer = Transformer(TRANSFORMER_MODEL, "base", 1, True) - adapter_transformer = AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) - sentence_transformer = SentenceTransformer(SENTENCE_MODEL) \ No newline at end of file + # We pre-download all models needed for the tests. + # We have to be careful to NOT import anything from square_model_inference because this script runs in the Dockerfile + # BEFORE any other of our code is copied (so that we do not have to re-download after every code change). 
+ device = "cpu" + + # Pre-download Huggingface model for tests + _ = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL) + model = AutoModelWithHeads.from_pretrained(TRANSFORMER_MODEL).to(device) + + # Pre-download adapters + logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") + index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(TRANSFORMER_MODEL)) + adapter_index = json.load(open(index_file)) + adapters = set() + for task, datasets in adapter_index.items(): + for dataset in datasets.keys(): + for key in datasets[dataset].keys(): + if key != "default": + for org in datasets[dataset][key]["versions"].keys(): + adapters.add(f"{task}/{dataset}@{org}") + for adapter in adapters: + logger.debug(f"Loading adapter {adapter}") + model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) + + # Pre-download sentence-transformer models for tests + _ = SentenceTransformer(model_name_or_path=SENTENCE_MODEL, device=device) From 9c5da6ae4818021d17dc49c4721115e4b5fe317a Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 1 Jul 2021 16:03:35 +0200 Subject: [PATCH 08/23] Made PredictionRequest a fixture and just set relevant fields --- .../inference_server/tests/conftest.py | 13 ++ .../tests/test_inference/test_transformers.py | 211 +++++------------- 2 files changed, 73 insertions(+), 151 deletions(-) diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index df5906e2f..c3a4984ae 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -78,3 +78,16 @@ def test_adapter(): @pytest.fixture(scope="class") def test_sentence_transformer(): return SentenceTransformer(SENTENCE_MODEL, 1, True) + +@pytest.fixture() +def request(): + request = PredictionRequest.parse_obj({ + "input": ["test"], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task": "sequence_classification", + "task_kwargs": {}, + "adapter_name": "" + }) + return request \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index 9c1a9b00d..bf6c9a3d2 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -9,16 +9,9 @@ class TestTransformerSequenceClassification: @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification(self, test_transformer_sequence_classification, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "sequence_classification", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_sequence_classification(self, request, test_transformer_sequence_classification, input): + request.input = input + request.task = "sequence_classification" prediction = await test_transformer_sequence_classification.predict(request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") @@ -28,16 +21,9 @@ async def test_sequence_classification(self, 
test_transformer_sequence_classific assert "id2label" in prediction.task_outputs @pytest.mark.asyncio - async def test_sequence_classification_output_attention(self, test_transformer_sequence_classification): - request = PredictionRequest.parse_obj({ - "input": ["testing test"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {"output_attentions": True}, - "task": "sequence_classification", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_sequence_classification_output_attention(self, request, test_transformer_sequence_classification): + request.task = "sequence_classification" + request.model_kwargs = {"output_attentions": True} prediction = await test_transformer_sequence_classification.predict(request) assert "attentions" in prediction.model_outputs @@ -46,16 +32,10 @@ async def test_sequence_classification_output_attention(self, test_transformer_s @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification_regression(self, test_transformer_sequence_classification, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "sequence_classification", - "task_kwargs": {"is_regression": True}, - "adapter_name": "" - }) + async def test_sequence_classification_regression(self, request, test_transformer_sequence_classification, input): + request.input = input + request.task = "sequence_classification" + request.task_kwargs = {"is_regression": True} prediction = await test_transformer_sequence_classification.predict(request) assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" @@ -71,16 +51,9 @@ class TestTransformerTokenClassification: (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification(self, test_transformer_token_classification, input, word_ids): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "token_classification", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_token_classification(self, request, test_transformer_token_classification, input, word_ids): + request.input = input + request.task = "token_classification" prediction = await test_transformer_token_classification.predict(request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") @@ -96,16 +69,10 @@ async def test_token_classification(self, test_transformer_token_classification, (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification_regression(self, test_transformer_token_classification, input, word_ids): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "token_classification", - "task_kwargs": {"is_regression": True}, - "adapter_name": "" - }) + async def test_token_classification_regression(self, request, test_transformer_token_classification, input, 
word_ids): + request.input = input + request.task = "token_classification" + request.task_kwargs = {"is_regression": True} prediction = await test_transformer_token_classification.predict(request) assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") @@ -124,16 +91,10 @@ class TestTransformerEmbedding: (["this is a test"], "cls"), (["this is a test", "this is a test with a longer sentence"], "cls")], ) - async def test_embedding(self, test_transformer_embedding, input, mode): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {"embedding_mode": mode}, - "adapter_name": "" - }) + async def test_embedding(self, request, test_transformer_embedding, input, mode): + request.input = input + request.task = "embedding" + request.task_kwargs = {"embedding_mode": mode} prediction = await test_transformer_embedding.predict(request) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 @@ -146,16 +107,10 @@ async def test_embedding(self, test_transformer_embedding, input, mode): (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_embedding_token(self, test_transformer_embedding, input, word_ids): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {"embedding_mode": "token"}, - "adapter_name": "" - }) + async def test_embedding_token(self, request, test_transformer_embedding, input, word_ids): + request.input = input + request.task = "embedding" + request.task_kwargs = {"embedding_mode": "token"} prediction = await test_transformer_embedding.predict(request) assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 @@ -166,30 +121,18 @@ async def test_embedding_token(self, test_transformer_embedding, input, word_ids assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.asyncio - async def test_embedding_unknown_mode(self, test_transformer_embedding): - request = PredictionRequest.parse_obj({ - "input": ["test"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {"embedding_mode": "this mode does not exist"}, - "adapter_name": "" - }) + async def test_embedding_unknown_mode(self, request, test_transformer_embedding): + request.task = "embedding" + request.task_kwargs = {"embedding_mode": "this mode does not exist"} + with pytest.raises(ValueError): prediction = await test_transformer_embedding.predict(request) @pytest.mark.asyncio - async def test_forbid_is_preprocessed(self, test_transformer_embedding): - request = PredictionRequest.parse_obj({ - "input": ["test"], - "is_preprocessed": True, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_forbid_is_preprocessed(self, request, test_transformer_embedding): + request.task = "embedding" + request.is_preprocessed = True + with pytest.raises(ValueError): prediction = await test_transformer_embedding.predict(request) @@ -201,16 +144,10 @@ class TestTransformerQuestionAnswering: ([["What is a test?", "A test is a thing where you test."], ["What is a longer test?", "A test is a thing 
where you test. If it is longer you call it longer"]])], ) - async def test_question_answering(self, test_transformer_question_answering, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "question_answering", - "task_kwargs": {"topk": 1}, - "adapter_name": "" - }) + async def test_question_answering(self, request, test_transformer_question_answering, input): + request.input = input + request.task = "question_answering" + request.task_kwargs = {"topk": 1} prediction = await test_transformer_question_answering.predict(request) answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] @@ -219,17 +156,10 @@ async def test_question_answering(self, test_transformer_question_answering, inp assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) @pytest.mark.asyncio - async def test_question_answering_topk(self, test_transformer_question_answering): - input = [["What is a test?", "A test is a thing where you test."]] - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "question_answering", - "task_kwargs": {"topk": 2}, - "adapter_name": "" - }) + async def test_question_answering_topk(self, request, test_transformer_question_answering): + request.input = [["What is a test?", "A test is a thing where you test."]] + request.task = "question_answering" + request.task_kwargs = {"topk": 2} prediction = await test_transformer_question_answering.predict(request) answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] @@ -244,57 +174,36 @@ class TestTransformerGeneration: @pytest.mark.parametrize("input", [(["Generate text"]), (["Generate text", "And more text"])], ) - async def test_generation(self, test_transformer_generation, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "generation", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_generation(self, request, test_transformer_generation, input): + request.input = input + request.task = "generation" prediction = await test_transformer_generation.predict(request) assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) @pytest.mark.asyncio - async def test_generation_output_attention_and_scores(self, test_transformer_generation): - request = PredictionRequest.parse_obj({ - "input": ["Generate text"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": { - "output_attentions": True, - "output_scores": True - }, - "task": "generation", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_generation_output_attention_and_scores(self, request, test_transformer_generation): + request.task = "generation" + request.model_kwargs = { + "output_attentions": True, + "output_scores": True + } prediction = await test_transformer_generation.predict(request) assert "scores" in prediction.model_outputs assert "attentions" in prediction.model_outputs @pytest.mark.asyncio - async def test_generation_beam_sample_multiple_seqs(self, test_transformer_generation): - request = PredictionRequest.parse_obj({ - "input": ["Generate text"], - 
"is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "generation", - "task_kwargs": { - "num_beams": 2, - "do_sample": True, - "top_k": 10, - "top_p": 0.5, - "no_repeat_ngram_size": 2, - "num_return_sequences": 2 - }, - "adapter_name": "" - }) + async def test_generation_beam_sample_multiple_seqs(self, request, test_transformer_generation): + request.task = "generation" + request.task_kwargs = { + "num_beams": 2, + "do_sample": True, + "top_k": 10, + "top_p": 0.5, + "no_repeat_ngram_size": 2, + "num_return_sequences": 2 + } prediction = await test_transformer_generation.predict(request) assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file From fd7fae3c0652cc6a32a9496f3d2b24129d2fbef6 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 1 Jul 2021 16:20:22 +0200 Subject: [PATCH 09/23] Fixed disallowed fixture name (request -> prediction_request) --- .../inference_server/tests/conftest.py | 2 +- .../test_sentence_transformer.py | 48 ++----- .../tests/test_inference/test_transformers.py | 126 +++++++++--------- 3 files changed, 77 insertions(+), 99 deletions(-) diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index c3a4984ae..880250d56 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -80,7 +80,7 @@ def test_sentence_transformer(): return SentenceTransformer(SENTENCE_MODEL, 1, True) @pytest.fixture() -def request(): +def prediction_request(): request = PredictionRequest.parse_obj({ "input": ["test"], "is_preprocessed": False, diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py index a290a78c6..92d70faf6 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py @@ -1,7 +1,6 @@ import pytest import numpy as np -from square_model_inference.models.request import PredictionRequest @pytest.mark.usefixtures("test_sentence_transformer") @@ -10,45 +9,24 @@ class TestSentenceTransformerEmbedding: @pytest.mark.asyncio @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])]) - async def test_embedding(self, test_sentence_transformer, input): - request = PredictionRequest.parse_obj({ - "input": input, - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {}, - "adapter_name": "" - }) - - prediction = await test_sentence_transformer.predict(request) + async def test_embedding(self, prediction_request, test_sentence_transformer, input): + prediction_request.task = "embedding" + prediction_request.input = input + + prediction = await test_sentence_transformer.predict(prediction_request) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) @pytest.mark.asyncio - async def test_not_embedding(self, test_sentence_transformer): - request = PredictionRequest.parse_obj({ - "input": ["input"], - "is_preprocessed": False, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "sequence_classification", - "task_kwargs": {}, - "adapter_name": 
"" - }) + async def test_not_embedding(self, prediction_request, test_sentence_transformer): + prediction_request.task = "sequence_classification" with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(request) + prediction = await test_sentence_transformer.predict(prediction_request) @pytest.mark.asyncio - async def test_forbid_is_preprocessed(self, test_sentence_transformer): - request = PredictionRequest.parse_obj({ - "input": ["test"], - "is_preprocessed": True, - "preprocessing_kwargs": {}, - "model_kwargs": {}, - "task": "embedding", - "task_kwargs": {}, - "adapter_name": "" - }) + async def test_forbid_is_preprocessed(self, prediction_request, test_sentence_transformer): + prediction_request.task = "embedding" + prediction_request.is_preprocessed = True + with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(request) \ No newline at end of file + prediction = await test_sentence_transformer.predict(prediction_request) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index bf6c9a3d2..5f99ed649 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -1,19 +1,19 @@ import pytest -from square_model_inference.models.request import PredictionRequest import numpy as np + @pytest.mark.usefixtures("test_transformer_sequence_classification") class TestTransformerSequenceClassification: @pytest.mark.asyncio @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification(self, request, test_transformer_sequence_classification, input): - request.input = input - request.task = "sequence_classification" + async def test_sequence_classification(self, prediction_request, test_transformer_sequence_classification, input): + prediction_request.input = input + prediction_request.task = "sequence_classification" - prediction = await test_transformer_sequence_classification.predict(request) + prediction = await test_transformer_sequence_classification.predict(prediction_request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") assert len(prediction.task_outputs["labels"]) == len(input) assert all(isinstance(prediction.task_outputs["labels"][i], int) for i in range(len(input))) @@ -21,23 +21,23 @@ async def test_sequence_classification(self, request, test_transformer_sequence_ assert "id2label" in prediction.task_outputs @pytest.mark.asyncio - async def test_sequence_classification_output_attention(self, request, test_transformer_sequence_classification): - request.task = "sequence_classification" - request.model_kwargs = {"output_attentions": True} + async def test_sequence_classification_output_attention(self, prediction_request, test_transformer_sequence_classification): + prediction_request.task = "sequence_classification" + prediction_request.model_kwargs = {"output_attentions": True} - prediction = await test_transformer_sequence_classification.predict(request) + prediction = await test_transformer_sequence_classification.predict(prediction_request) assert "attentions" in prediction.model_outputs @pytest.mark.asyncio 
@pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])], ids=["single", "batch"]) - async def test_sequence_classification_regression(self, request, test_transformer_sequence_classification, input): - request.input = input - request.task = "sequence_classification" - request.task_kwargs = {"is_regression": True} + async def test_sequence_classification_regression(self, prediction_request, test_transformer_sequence_classification, input): + prediction_request.input = input + prediction_request.task = "sequence_classification" + prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_sequence_classification.predict(request) + prediction = await test_transformer_sequence_classification.predict(prediction_request) assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -51,11 +51,11 @@ class TestTransformerTokenClassification: (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification(self, request, test_transformer_token_classification, input, word_ids): - request.input = input - request.task = "token_classification" + async def test_token_classification(self, prediction_request, test_transformer_token_classification, input, word_ids): + prediction_request.input = input + prediction_request.task = "token_classification" - prediction = await test_transformer_token_classification.predict(request) + prediction = await test_transformer_token_classification.predict(prediction_request) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") assert all(len(prediction.task_outputs["labels"][i]) == len(word_ids[i]) for i in range(len(input))) assert "logits" in prediction.model_outputs @@ -69,12 +69,12 @@ async def test_token_classification(self, request, test_transformer_token_classi (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_token_classification_regression(self, request, test_transformer_token_classification, input, word_ids): - request.input = input - request.task = "token_classification" - request.task_kwargs = {"is_regression": True} + async def test_token_classification_regression(self, prediction_request, test_transformer_token_classification, input, word_ids): + prediction_request.input = input + prediction_request.task = "token_classification" + prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_token_classification.predict(request) + prediction = await test_transformer_token_classification.predict(prediction_request) assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -91,12 +91,12 @@ class TestTransformerEmbedding: (["this is a test"], "cls"), (["this is a test", "this is a test with a longer sentence"], "cls")], ) - async def test_embedding(self, request, test_transformer_embedding, input, mode): - request.input 
= input - request.task = "embedding" - request.task_kwargs = {"embedding_mode": mode} + async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): + prediction_request.input = input + prediction_request.task = "embedding" + prediction_request.task_kwargs = {"embedding_mode": mode} - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) assert "hidden_states" not in prediction.model_outputs @@ -107,12 +107,12 @@ async def test_embedding(self, request, test_transformer_embedding, input, mode) (["this is a test", "this is a test with a longer sentence"], [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], ids=["single", "batch"]) - async def test_embedding_token(self, request, test_transformer_embedding, input, word_ids): - request.input = input - request.task = "embedding" - request.task_kwargs = {"embedding_mode": "token"} + async def test_embedding_token(self, prediction_request, test_transformer_embedding, input, word_ids): + prediction_request.input = input + prediction_request.task = "embedding" + prediction_request.task_kwargs = {"embedding_mode": "token"} - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) @@ -121,20 +121,20 @@ async def test_embedding_token(self, request, test_transformer_embedding, input, assert prediction.task_outputs["word_ids"] == word_ids @pytest.mark.asyncio - async def test_embedding_unknown_mode(self, request, test_transformer_embedding): - request.task = "embedding" - request.task_kwargs = {"embedding_mode": "this mode does not exist"} + async def test_embedding_unknown_mode(self, prediction_request, test_transformer_embedding): + prediction_request.task = "embedding" + prediction_request.task_kwargs = {"embedding_mode": "this mode does not exist"} with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) @pytest.mark.asyncio - async def test_forbid_is_preprocessed(self, request, test_transformer_embedding): - request.task = "embedding" - request.is_preprocessed = True + async def test_forbid_is_preprocessed(self, prediction_request, test_transformer_embedding): + prediction_request.task = "embedding" + prediction_request.is_preprocessed = True with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(request) + prediction = await test_transformer_embedding.predict(prediction_request) @pytest.mark.usefixtures("test_transformer_question_answering") class TestTransformerQuestionAnswering: @@ -144,24 +144,24 @@ class TestTransformerQuestionAnswering: ([["What is a test?", "A test is a thing where you test."], ["What is a longer test?", "A test is a thing where you test. 
If it is longer you call it longer"]])], ) - async def test_question_answering(self, request, test_transformer_question_answering, input): - request.input = input - request.task = "question_answering" - request.task_kwargs = {"topk": 1} + async def test_question_answering(self, prediction_request, test_transformer_question_answering, input): + prediction_request.input = input + prediction_request.task = "question_answering" + prediction_request.task_kwargs = {"topk": 1} - prediction = await test_transformer_question_answering.predict(request) + prediction = await test_transformer_question_answering.predict(prediction_request) answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs assert len(prediction.task_outputs["answers"]) == len(input) assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) @pytest.mark.asyncio - async def test_question_answering_topk(self, request, test_transformer_question_answering): - request.input = [["What is a test?", "A test is a thing where you test."]] - request.task = "question_answering" - request.task_kwargs = {"topk": 2} + async def test_question_answering_topk(self, prediction_request, test_transformer_question_answering): + prediction_request.input = [["What is a test?", "A test is a thing where you test."]] + prediction_request.task = "question_answering" + prediction_request.task_kwargs = {"topk": 2} - prediction = await test_transformer_question_answering.predict(request) + prediction = await test_transformer_question_answering.predict(prediction_request) answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs assert len(prediction.task_outputs["answers"]) == len(input) @@ -174,29 +174,29 @@ class TestTransformerGeneration: @pytest.mark.parametrize("input", [(["Generate text"]), (["Generate text", "And more text"])], ) - async def test_generation(self, request, test_transformer_generation, input): - request.input = input - request.task = "generation" + async def test_generation(self, prediction_request, test_transformer_generation, input): + prediction_request.input = input + prediction_request.task = "generation" - prediction = await test_transformer_generation.predict(request) + prediction = await test_transformer_generation.predict(prediction_request) assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) @pytest.mark.asyncio - async def test_generation_output_attention_and_scores(self, request, test_transformer_generation): - request.task = "generation" - request.model_kwargs = { + async def test_generation_output_attention_and_scores(self, prediction_request, test_transformer_generation): + prediction_request.task = "generation" + prediction_request.model_kwargs = { "output_attentions": True, "output_scores": True } - prediction = await test_transformer_generation.predict(request) + prediction = await test_transformer_generation.predict(prediction_request) assert "scores" in prediction.model_outputs assert "attentions" in prediction.model_outputs @pytest.mark.asyncio - async def test_generation_beam_sample_multiple_seqs(self, request, test_transformer_generation): - request.task = 
"generation" - request.task_kwargs = { + async def test_generation_beam_sample_multiple_seqs(self, prediction_request, test_transformer_generation): + prediction_request.task = "generation" + prediction_request.task_kwargs = { "num_beams": 2, "do_sample": True, "top_k": 10, @@ -205,5 +205,5 @@ async def test_generation_beam_sample_multiple_seqs(self, request, test_transfor "num_return_sequences": 2 } - prediction = await test_transformer_generation.predict(request) + prediction = await test_transformer_generation.predict(prediction_request) assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file From 9f03fff0118d742b1af7eeab0fa79963ee9fa43a Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 9 Jul 2021 16:30:54 +0200 Subject: [PATCH 10/23] Updated requirements and installation process; started splitting PredictionOutput in multiple classes for better readable documentation --- square-model-inference-api/Makefile | 5 +- square-model-inference-api/README.md | 18 ++-- .../inference_server/Dockerfile | 26 +++--- .../inference_server/requirements.txt | 8 -- .../inference_server/requirements1.txt | 8 ++ .../inference_server/requirements2.txt | 1 + .../api/routes/prediction.py | 69 +++++++++++--- .../square_model_inference/core/config.py | 12 ++- .../core/event_handlers.py | 6 +- .../inference/adaptertransformer.py | 55 +++++------ .../square_model_inference/inference/model.py | 5 +- .../inference/sentencetransformer.py | 15 +-- .../inference/transformer.py | 31 ++++--- .../models/prediction.py | 85 +++++++++++------ .../square_model_inference/models/request.py | 2 +- .../inference_server/tests/conftest.py | 15 ++- .../pre_test_setup_for_docker_caching.py | 27 +++--- .../tests/test_inference/test_adapter.py | 3 +- .../test_sentence_transformer.py | 18 ++-- .../tests/test_inference/test_transformers.py | 93 +++++++++---------- .../uninstall_requirements.txt | 1 + square-model-inference-api/nginx/nginx.conf | 8 +- 22 files changed, 304 insertions(+), 207 deletions(-) delete mode 100644 square-model-inference-api/inference_server/requirements.txt create mode 100644 square-model-inference-api/inference_server/requirements1.txt create mode 100644 square-model-inference-api/inference_server/requirements2.txt create mode 100644 square-model-inference-api/inference_server/uninstall_requirements.txt diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 7acebffb0..13e50caf8 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -11,7 +11,10 @@ test: pytest --cov install: - pip install --upgrade pip && pip install -r inference_server/requirements.txt + pip install --upgrade pip + pip install -r inference_server/requirements1.txt + pip uninstall -y -r inference_server/uninstall_requirements.txt + pip install -r inference_server/requirements2.txt run: PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 5b93ceaad..324b89cdc 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -27,15 +27,21 @@ Python 3.7+, Docker (optional), Make (optional) ## Installation Install the required packages in your local environment (ideally virtualenv, conda, etc.). 
- - +```bash +pip install -r inference_server/requirements1.txt +pip uninstall -y -r inference_server/uninstall_requirements.txt +pip install -r inference_server/requirements2.txt +``` +or ```sh -python -m venv venv -source venv/bin/activate make install ``` +**Why two requirement.txt and why the uninstall?** +`sentence-transformers` depends on `transformers` and it will be installed along with it. +However, we use `adapter-transformers` (a fork of `transformers`) in this project. +Both `transformers` and `adapter-transformers` use the same namespace so they conflict. +Thus, we first install `sentence-transformers` along with `transformers`, +uninstall `transformers`, and finally install `adapter-transformers`. #### Running Localhost diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index fe2b5344c..fefc84ae8 100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -1,25 +1,29 @@ -FROM python:3.8.2 as build +FROM python:3.7.6-slim-buster as base ENV PYTHONUNBUFFERED 1 EXPOSE 8000 WORKDIR /app -COPY requirements.txt ./ -RUN pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip install --upgrade pip +COPY requirements1.txt requirements2.txt uninstall_requirements.txt ./ +RUN pip install -r requirements1.txt +RUN pip uninstall -y -r uninstall_requirements.txt +RUN pip install -r requirements2.txt -COPY ./inference_server/tests/pre_test_setup_for_docker_caching.py ./inference_server/tests/pre_test_setup_for_docker_caching.py -RUN PYTHONPATH=./inference_server; python ./inference_server/tests/pre_test_setup_for_docker_caching.py +# Testing stage. We first pre-download any models separately for caching (pre_test_setup_for_docker_caching.py) and then +# run the tests +FROM base as test +COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_docker_caching.py +RUN python ./tests/pre_test_setup_for_docker_caching.py COPY . ./ - -FROM base as test -WORKDIR /app RUN pip install pytest pytest-cov pytest-asyncio -COPY ../tests tests -RUN PYTHONPATH=./inference_server; pytest --cov +RUN PYTHONPATH=./ pytest --cov +# Deployment stage FROM base as build +COPY . 
./ + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/requirements.txt b/square-model-inference-api/inference_server/requirements.txt deleted file mode 100644 index 0b2dd68dc..000000000 --- a/square-model-inference-api/inference_server/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -uvicorn -fastapi -pydantic -loguru -python-dotenv -adapter-transformers -sentencepiece -torch \ No newline at end of file diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt new file mode 100644 index 000000000..d8a296d3e --- /dev/null +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -0,0 +1,8 @@ +uvicorn==0.13.4 +fastapi==0.65.1 +pydantic==1.8.2 +loguru==0.5.3 +python-dotenv==0.17.1 +sentencepiece==0.1.95 +torch==1.8.1 +sentence-transformers==1.2.0 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/requirements2.txt b/square-model-inference-api/inference_server/requirements2.txt new file mode 100644 index 000000000..1503d3a94 --- /dev/null +++ b/square-model-inference-api/inference_server/requirements2.txt @@ -0,0 +1 @@ +adapter-transformers==2.0.1 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index 6632c3bf4..c06c0633f 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -2,21 +2,68 @@ from starlette.requests import Request from loguru import logger -from square_model_inference.models.request import PredictionRequest -from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.inference.model import Model +from square_model_inference.models.request import PredictionRequest, Task +from square_model_inference.models.prediction import PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ + PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding router = APIRouter() +@router.post("/sequence-classification", response_model=PredictionOutputForSequenceClassification, name="sequence classification") +async def sequence_classification( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForSequenceClassification: -@router.post("/predict", response_model=PredictionOutput, name="predict") -async def predict( - request: Request, - prediction_request: PredictionRequest = None, -) -> PredictionOutput: + logger.info(f"Sequence Classification Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.sequence_classification) - logger.info(f"Request: {prediction_request.dict()}") - model: Model = request.app.state.model - prediction: PredictionOutput = await model.predict(prediction_request) + return prediction + +@router.post("/token-classification", response_model=PredictionOutputForTokenClassification, name="token classification") +async def token_classification( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForTokenClassification: + + logger.info(f"Token Classification Request: 
{prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.token_classification) + + return prediction + +@router.post("/embedding", response_model=PredictionOutputForEmbedding, name="embedding") +async def embedding( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForEmbedding: + + logger.info(f"Embedding Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.embedding) return prediction + +@router.post("/question-answering", response_model=PredictionOutputForQuestionAnswering, name="question answering") +async def question_answering( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForQuestionAnswering: + + logger.info(f"Question Answering Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.question_answering) + + return prediction + +@router.post("/generation", response_model=PredictionOutputForGeneration, name="generation") +async def generation( + request: Request, + prediction_request: PredictionRequest, +) -> PredictionOutputForGeneration: + + logger.info(f"Generation Request: {prediction_request.dict()}") + model = request.app.state.model + prediction = await model.predict(prediction_request, Task.generation) + + return prediction \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index c1cd0c6a3..224417b3d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -6,19 +6,21 @@ config = Config(".env") -# Disable CUDA even if available -DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) # Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers MODEL_NAME: str = config("MODEL_NAME") # Type of the model, e.g. Transformers, Adapter, ... 
# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model MODEL_TYPE: str = config("MODEL_TYPE") -# Cache directory where model weights are stored -# This is the name for the env variable used by transformers and sentence-transformers package -TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") +# Disable CUDA even if available +DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) # Batch size used for many inputs BATCH_SIZE: int = config("BATCH_SIZE", cast=int, default=32) +MAX_INPUT_SIZE: int = config("MAX_INPUT_SIZE", cast=int, default=1024) + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE: str = config("TRANSFORMERS_CACHE") # For MODEL_TYPE=transformers: decides the AutoModel* class used # See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 5a629fa2e..9498a0b38 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -4,7 +4,8 @@ from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer -from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, TRANSFORMERS_CACHE +from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, \ + TRANSFORMERS_CACHE, MAX_INPUT_SIZE from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer @@ -19,7 +20,8 @@ "model_class": MODEL_CLASS, "disable_gpu": DISABLE_GPU, "batch_size": BATCH_SIZE, - "transformers_cache": TRANSFORMERS_CACHE + "transformers_cache": TRANSFORMERS_CACHE, + "max_input_size": MAX_INPUT_SIZE } def _startup_model(app: FastAPI) -> None: diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 3a0507219..e9c052c4c 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -2,8 +2,7 @@ from loguru import logger -from transformers import AutoModelWithHeads -from transformers.adapters.utils import download_cached, ADAPTER_HUB_INDEX_FILE +from transformers import AutoModelWithHeads, list_adapters from square_model_inference.inference.transformer import Transformer from square_model_inference.models.request import PredictionRequest, Task @@ -15,7 +14,7 @@ class AdapterTransformer(Transformer): """ The class for all adapter-based models using the adapter-transformers package """ - def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, **kwargs): + def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, max_input_size, **kwargs): """ Initialize the Adapter with its underlying Transformer and pre-load all available adapters from adapterhub.ml :param model_name: the Huggingface 
model name @@ -23,34 +22,36 @@ def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, **kw :param disable_gpu: do not move model to GPU even if CUDA is available :param transformers_cache: Should be same as TRANSFORMERS_CACHE env variable. This folder will be used to store the adapters + :param max_input_size: requests with a larger input are rejected :param kwargs: Not used """ self._load_model(AutoModelWithHeads, model_name, disable_gpu) self._load_adapter(model_name, transformers_cache) self.batch_size = batch_size + self.max_input_size = max_input_size def _load_adapter(self, model_name, transformers_cache): """ Pre-load all available adapters for MODEL_NAME from adapterhub.ml. We parse the hub index to extract all names and then load each model. """ - logger.info("Loading all available adapters from adapterhub.ml") - logger.warning("This will not load adapters published only on https://huggingface.co/models") - logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") - index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(model_name)) - adapter_index = json.load(open(index_file)) - adapters = set() - for task, datasets in adapter_index.items(): - for dataset in datasets.keys(): - for key in datasets[dataset].keys(): - if key != "default": - for org in datasets[dataset][key]["versions"].keys(): - adapters.add(f"{task}/{dataset}@{org}") + logger.info("Loading all available adapters") + adapter_infos = [info for info in list_adapters(source="ah") if info.model_name==model_name] + adapters = set(f"{adapter_info.task}/{adapter_info.subtask}@{adapter_info.username}" for adapter_info in adapter_infos) for adapter in adapters: logger.debug(f"Loading adapter {adapter}") - self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) - - # def _load_single_adapter(self, adapter_name: str): + try: + self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) + except KeyError as e: + logger.debug(f"Could not load {adapter} due to incomplete config: Missing key {e.args[0]}") + except RuntimeError as e: + if "Error(s) in loading state_dict" in e.args[0]: + logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") + else: + raise(e) + + +# def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: # logger.info(f"Loading new adapter {adapter_name}") # self.model.load_adapter(adapter_name, with_head=True, load_as=adapter_name) @@ -64,7 +65,7 @@ def _token_classification(self, request: PredictionRequest) -> PredictionOutput: label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] id2label = {v:k for k,v in label2id.items()} - prediction.task_outputs["id2label"] = id2label + prediction.id2label = id2label return prediction @@ -75,26 +76,28 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp label2id = self.model.config.prediction_heads[request.adapter_name]["label2id"] id2label = {v:k for k,v in label2id.items()} - prediction.task_outputs["id2label"] = id2label + prediction.id2label = id2label return prediction - async def predict(self, request: PredictionRequest) -> PredictionOutput: + async def predict(self, request: PredictionRequest, task: Task) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. 
Please use text as input.") + if len(request.input) > self.max_input_size: + raise ValueError(f"Input is too large. Max input size is {self.max_input_size}") if not request.adapter_name or request.adapter_name not in self.model.config.adapters.adapters: raise ValueError(f"Unknown or missing adapter {request.adapter_name}. " f"Please provider a fully specified adapter name from adapterhub.ml") self.model.set_active_adapters(request.adapter_name) - if request.task == Task.sequence_classification: + if task == Task.sequence_classification: return self._sequence_classification(request) - elif request.task == Task.token_classification: + elif task == Task.token_classification: return self._token_classification(request) - elif request.task == Task.question_answering: + elif task == Task.question_answering: return self._question_answering(request) - elif request.task == Task.embedding: + elif task == Task.embedding: return self._embedding(request) - elif request.task == Task.generation: + elif task == Task.generation: return self._generation(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/model.py b/square-model-inference-api/inference_server/square_model_inference/inference/model.py index 4aa92d8d2..f8fbed524 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/model.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/model.py @@ -1,4 +1,4 @@ -from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutput @@ -8,11 +8,12 @@ class Model: __init__ is supposed to load all weights and other necessary files (e.g. 
tokenizer) so that the model can directly perform inference on request """ - async def predict(self, payload: PredictionRequest) -> PredictionOutput: + async def predict(self, payload: PredictionRequest, task: Task) -> PredictionOutput: """ Take an input, pre-process it accordingly, perform inference according to the task, post-process the result and return it :param payload: the prediction request containing the input and any other parameters required + :param task: The task that the model should perform with the payload :return: the result of the prediction """ raise NotImplementedError diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index 6244a0222..7523bab29 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -9,7 +9,7 @@ from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task -from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForEmbedding class SentenceTransformer(Model): @@ -17,16 +17,18 @@ class SentenceTransformer(Model): The class for all sentence-transformers models """ - def __init__(self, model_name, batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, batch_size, disable_gpu, max_input_size, **kwargs): """ Initialize the SentenceTransformer :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) :param batch_size: batch size used for inference :param disable_gpu: do not move model to GPU even if CUDA is available + :param max_input_size: requests with a larger input are rejected :param kwargs: Not used """ self._load_model(model_name, disable_gpu) self.batch_size = batch_size + self.max_input_size = max_input_size def _load_model(self, model_name, disable_gpu): """ @@ -41,14 +43,15 @@ def _load_model(self, model_name, disable_gpu): def _embedding(self, request: PredictionRequest) -> PredictionOutput: embeddings = self.model.encode(request.input, batch_size=self.batch_size, show_progress_bar=False) - return PredictionOutput(model_outputs={"embeddings": embeddings}, task_outputs={}) + return PredictionOutputForEmbedding(model_outputs={"embeddings": embeddings}) - async def predict(self, request: PredictionRequest) -> PredictionOutput: + async def predict(self, request: PredictionRequest, task: Task) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") - - if request.task != Task.embedding: + if len(request.input) > self.max_input_size: + raise ValueError(f"Input is too large. 
Max input size is {self.max_input_size}") + if task != Task.embedding: raise ValueError("Only embedding task supported by this model") return self._embedding(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 46c68c77a..c7657de47 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -11,7 +11,8 @@ from square_model_inference.inference.model import Model from square_model_inference.models.request import PredictionRequest, Task -from square_model_inference.models.prediction import PredictionOutput +from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ + PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding CLASS_MAPPING = { "base": AutoModel, @@ -27,19 +28,21 @@ class Transformer(Model): """ SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] - def __init__(self, model_name, model_class, batch_size, disable_gpu, **kwargs): + def __init__(self, model_name, model_class, batch_size, disable_gpu, max_input_size, **kwargs): """ Initialize the Transformer :param model_name: the Huggingface model name :param model_class: the class name (according to CLASS_MAPPING) to use :param batch_size: batch size used for inference :param disable_gpu: do not move model to GPU even if CUDA is available + :param max_input_size: requests with a larger input are rejected :param kwargs: Not used """ if model_class not in CLASS_MAPPING: raise RuntimeError(f"Unknown MODEL_CLASS. 
Must be one of {CLASS_MAPPING.keys()}") self._load_model(CLASS_MAPPING[model_class], model_name, disable_gpu) self.batch_size = batch_size + self.max_input_size = max_input_size def _load_model(self, model_cls, model_name, disable_gpu): """ @@ -137,7 +140,7 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: task_outputs["word_ids"] = [features.word_ids(i) for i in range(len(request.input))] predictions["embeddings"] = emb - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForEmbedding(model_outputs=predictions, **task_outputs) def _token_classification(self, request: PredictionRequest) -> PredictionOutput: predictions, features = self._predict(request, output_features=True) @@ -154,7 +157,7 @@ def _token_classification(self, request: PredictionRequest) -> PredictionOutput: predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForTokenClassification(model_outputs=predictions, **task_outputs) def _sequence_classification(self, request: PredictionRequest) -> PredictionOutput: predictions = self._predict(request) @@ -170,7 +173,7 @@ def _sequence_classification(self, request: PredictionRequest) -> PredictionOutp predictions["logits"] = probabilities task_outputs["labels"] = torch.argmax(predictions["logits"], dim=-1).tolist() - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForSequenceClassification(model_outputs=predictions, **task_outputs) def _generation(self, request: PredictionRequest) -> PredictionOutput: request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", False) @@ -201,7 +204,7 @@ def _generation(self, request: PredictionRequest) -> PredictionOutput: clean_up_tokenization_spaces=request.task_kwargs.get("clean_up_tokenization_spaces", False)) for seq in res["sequences"]] task_outputs["generated_texts"].append(generated_texts) - return PredictionOutput(model_outputs=model_outputs, task_outputs=task_outputs) + return PredictionOutputForGeneration(model_outputs=model_outputs, **task_outputs) def _question_answering(self, request: PredictionRequest) -> PredictionOutput: """ @@ -304,20 +307,22 @@ def decode(start_: np.ndarray, end_: np.ndarray, topk: int, max_answer_len: int, answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""}) answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: request.task_kwargs.get("topk", 1)] task_outputs["answers"].append(answers) - return PredictionOutput(model_outputs=predictions, task_outputs=task_outputs) + return PredictionOutputForQuestionAnswering(model_outputs=predictions, **task_outputs) - async def predict(self, request: PredictionRequest) -> PredictionOutput: + async def predict(self, request: PredictionRequest, task: Task) -> PredictionOutput: if request.is_preprocessed: raise ValueError("is_preprocessed=True is not supported for this model. Please use text as input.") + if len(request.input) > self.max_input_size: + raise ValueError(f"Input is too large. 
Max input size is {self.max_input_size}") - if request.task == Task.sequence_classification: + if task == Task.sequence_classification: return self._sequence_classification(request) - elif request.task == Task.token_classification: + elif task == Task.token_classification: return self._token_classification(request) - elif request.task == Task.embedding: + elif task == Task.embedding: return self._embedding(request) - elif request.task == Task.question_answering: + elif task == Task.question_answering: return self._question_answering(request) - elif request.task == Task.generation: + elif task == Task.generation: return self._generation(request) diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index e1160a1da..67be53aba 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,5 +1,5 @@ from collections import Iterable -from typing import Dict, Union, Tuple +from typing import Dict, Union, Tuple, List, Optional import torch from io import BytesIO @@ -53,7 +53,7 @@ class PredictionOutput(BaseModel): """ The results of the prediction of the model on the given input for the requested task. """ - model_outputs: dict = Field( + model_outputs: Dict = Field( description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array.

" "Decode the base64 string 'arr_string_b64' back to an array in Python like this:
" "arr_binary_b64 = arr_string_b64.encode()
" @@ -79,34 +79,6 @@ class PredictionOutput(BaseModel): "- 'sequences': The generated vocab ids for the sequence
" "Task 'generation' does not concatenate the tensors of the inputs together but instead creates a list" "of the tensors for each input." - - ) - task_outputs: dict = Field( - description="Dictionary containing the task outputs. This covers anything from generated text, " - "labels, id2label mappings, token2word mappings, etc.

" - "SentenceTransformer: This is not used.
" - "Transformer/ Adapter:
" - "'sentence_classification':
" - "- 'labels': List of the predicted label ids for the input
" - "- 'id2label': Mapping from label id to the label name
" - "'token_classification':
" - "- 'labels': List of the predicted label ids for the input
" - "- 'id2label': Mapping from label id to the label name
" - "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input.
" - "'None' represents special tokens that have been added by the tokenizer
" - "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')
" - "- 'word_ids': Mapping from tokens (as produced by the model tokenizer) to the words of the input. " - "Only returned if 'embedding_mode=token'
" - "'question_answering':
" - "- 'answers': List of lists of answers. Length of outer list is the number of inputs, " - "length of inner list is parameter 'topk' from the request's 'task_kwargs' (default 1). " - "Each answer is a dictionary with 'score', 'start' (span start index in context), 'end' (span end index in context), " - "and 'answer' (the extracted span). The inner list is sorted by score. If no answer span was extracted, " - "the empty span is returned (start and end both 0).
" - "'generation':
" - "- 'generated_texts': List of list of the generated texts. Length of outer list is the number of inputs, " - "length of inner list is parameter 'num_return_sequences' in request's 'task_kwargs'" ) model_output_is_encoded: bool = Field( not RETURN_PLAINTEXT_ARRAYS, @@ -125,3 +97,56 @@ def __init__(self, **data): super().__init__(**data) self.model_outputs = _encode_numpy(self.model_outputs) + +class PredictionOutputForSequenceClassification(PredictionOutput): + labels: List[int] = Field([], description="List of the predicted label ids for the input. Not set for regression.") + id2label: Dict[int, str] = Field({}, description="Mapping from label id to the label name. Not set for regression.") + + def __init__(self, **data): + super().__init__(**data) + + +class PredictionOutputForTokenClassification(PredictionOutput): + labels: List[List[int]] = Field([], description="List of the predicted label ids for the input. Not set for regression.") + id2label: Dict[int, str] = Field({}, description="Mapping from label id to the label name. Not set for regression.") + word_ids: List[List[Optional[int]]] = Field(..., description="Mapping from each token to the corresponding word in the input. " + "'None' represents special tokens added by tokenizer") + def __init__(self, **data): + super().__init__(**data) + + +class PredictionOutputForEmbedding(PredictionOutput): + embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.
One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')") + word_ids: List[List[Optional[int]]] = Field([], description="Only used by Transformers/ Adapters.
" + "Only set with embedding_mode='token'." + " Mapping from each token to the corresponding word in the input. " + "'None' represents special tokens added by tokenizer") + def __init__(self, **data): + super().__init__(**data) + + +class PredictionOutputForGeneration(PredictionOutput): + generated_texts: List[List[str]] = Field(..., description="List of list of the generated texts. Length of outer list is the number of inputs, " + "length of inner list is parameter 'num_return_sequences' in request's 'task_kwargs'") + def __init__(self, **data): + super().__init__(**data) + + +class QAAnswer(BaseModel): + """ + A span answer for question_answering with a score, the start and end character index and the extracted span answer. + """ + score: float + start: int + end: int + answer: str + + +class PredictionOutputForQuestionAnswering(PredictionOutput): + answers: List[List[QAAnswer]] = Field(..., description="List of lists of answers. Length of outer list is the number of inputs, " + "length of inner list is parameter 'topk' from the request's 'task_kwargs' (default 1). " + "Each answer is a dictionary with 'score', 'start' (span start index in context), 'end' (span end index in context), " + "and 'answer' (the extracted span). The inner list is sorted by score. If no answer span was extracted, " + "the empty span is returned (start and end both 0)") + def __init__(self, **data): + super().__init__(**data) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index ad4af8ae6..547de9191 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -50,7 +50,7 @@ class PredictionRequest(BaseModel): "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" "For example, set ‘output_attentions=True’ to receive the attention results in the output." ) - task: Task = Field(...) + #task: Task = Field(...) 
task_kwargs: dict = Field( default={}, description="Optional dictionary containing additional parameters for handling of the task and " diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 880250d56..268f69990 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -43,41 +43,41 @@ async def predict(self, payload: PredictionRequest) -> PredictionOutput: @pytest.fixture(scope="class") def test_transformer_sequence_classification(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "sequence_classification", 1, True) + return Transformer(TRANSFORMER_MODEL, "sequence_classification", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_embedding(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "base", 1, True) + return Transformer(TRANSFORMER_MODEL, "base", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_token_classification(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "token_classification", 1, True) + return Transformer(TRANSFORMER_MODEL, "token_classification", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_question_answering(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "question_answering", 1, True) + return Transformer(TRANSFORMER_MODEL, "question_answering", 1, True, 50) @pytest.fixture(scope="class") def test_transformer_generation(): torch.manual_seed(987654321) - return Transformer(TRANSFORMER_MODEL, "generation", 1, True) + return Transformer(TRANSFORMER_MODEL, "generation", 1, True, 50) @pytest.fixture(scope="class") def test_adapter(): - return AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE) + return AdapterTransformer(TRANSFORMER_MODEL, 1, True, TRANSFORMERS_TESTING_CACHE, 50) @pytest.fixture(scope="class") def test_sentence_transformer(): - return SentenceTransformer(SENTENCE_MODEL, 1, True) + return SentenceTransformer(SENTENCE_MODEL, 1, True, 50) @pytest.fixture() def prediction_request(): @@ -86,7 +86,6 @@ def prediction_request(): "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, - "task": "sequence_classification", "task_kwargs": {}, "adapter_name": "" }) diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index a6b537d5f..bb41c27ba 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -5,9 +5,8 @@ from sentence_transformers import SentenceTransformer from starlette.config import environ -from transformers import AutoTokenizer, AutoModelWithHeads +from transformers import AutoTokenizer, AutoModelWithHeads, list_adapters from loguru import logger -from transformers.adapter_utils import download_cached, ADAPTER_HUB_INDEX_FILE TRANSFORMERS_TESTING_CACHE = "./.model_testing_cache" environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE @@ -16,6 +15,7 @@ environ["DISABLE_GPU"] = "True" environ["BATCH_SIZE"] = "1" environ["RETURN_PLAINTEXT_ARRAYS"] = "True" +environ["MAX_INPUT_SIZE"] = "100" # Downloaded models: TRANSFORMER_MODEL = "bert-base-uncased" @@ -32,19 +32,20 @@ model = 
AutoModelWithHeads.from_pretrained(TRANSFORMER_MODEL).to(device) # Pre-download adapters - logger.warning("UPDATE with https://github.com/Adapter-Hub/adapter-transformers/pull/193") - index_file = download_cached(ADAPTER_HUB_INDEX_FILE.format(TRANSFORMER_MODEL)) - adapter_index = json.load(open(index_file)) - adapters = set() - for task, datasets in adapter_index.items(): - for dataset in datasets.keys(): - for key in datasets[dataset].keys(): - if key != "default": - for org in datasets[dataset][key]["versions"].keys(): - adapters.add(f"{task}/{dataset}@{org}") + logger.info("Loading all available adapters") + adapter_infos = [info for info in list_adapters(source="ah") if info.model_name==TRANSFORMER_MODEL] + adapters = set(f"{adapter_info.task}/{adapter_info.subtask}@{adapter_info.username}" for adapter_info in adapter_infos) for adapter in adapters: logger.debug(f"Loading adapter {adapter}") - model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) + try: + model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) + except KeyError as e: + logger.debug(f"Could not load {adapter} due to incomplete config:\n{e.args[0]}") + except RuntimeError as e: + if "Error(s) in loading state_dict" in e.args[0]: + logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") + else: + raise(e) # Pre-download sentence-transformer models for tests _ = SentenceTransformer(model_name_or_path=SENTENCE_MODEL, device=device) diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py index 585801453..aa02d1e35 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py @@ -3,7 +3,8 @@ from square_model_inference.models.request import PredictionRequest import numpy as np - +# todo +@pytest.mark.skip(reason="Waiting for next update of adapter-transformers") def test_adapter(): pytest.fail("Adapter tests waiting for next version of adapter-transformers. " "Not enough heads available. 
" diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py index 92d70faf6..1265aafb2 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_sentence_transformer.py @@ -2,6 +2,8 @@ import numpy as np +from square_model_inference.models.request import Task + @pytest.mark.usefixtures("test_sentence_transformer") class TestSentenceTransformerEmbedding: @@ -10,23 +12,27 @@ class TestSentenceTransformerEmbedding: @pytest.mark.parametrize("input", [(["this is a test"]), (["this is a test", "this is a test with a longer sentence"])]) async def test_embedding(self, prediction_request, test_sentence_transformer, input): - prediction_request.task = "embedding" prediction_request.input = input - prediction = await test_sentence_transformer.predict(prediction_request) + prediction = await test_sentence_transformer.predict(prediction_request, Task.embedding) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) @pytest.mark.asyncio async def test_not_embedding(self, prediction_request, test_sentence_transformer): - prediction_request.task = "sequence_classification" with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(prediction_request) + prediction = await test_sentence_transformer.predict(prediction_request, Task.sequence_classification) @pytest.mark.asyncio async def test_forbid_is_preprocessed(self, prediction_request, test_sentence_transformer): - prediction_request.task = "embedding" prediction_request.is_preprocessed = True with pytest.raises(ValueError): - prediction = await test_sentence_transformer.predict(prediction_request) \ No newline at end of file + prediction = await test_sentence_transformer.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_input_too_big(self, prediction_request, test_sentence_transformer): + prediction_request.input = ["test"]*1000 + + with pytest.raises(ValueError): + prediction = await test_sentence_transformer.predict(prediction_request, Task.embedding) \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index 5f99ed649..f389e9d00 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -2,6 +2,8 @@ import numpy as np +from square_model_inference.models.request import Task + @pytest.mark.usefixtures("test_transformer_sequence_classification") class TestTransformerSequenceClassification: @@ -11,21 +13,18 @@ class TestTransformerSequenceClassification: ids=["single", "batch"]) async def test_sequence_classification(self, prediction_request, test_transformer_sequence_classification, input): prediction_request.input = input - prediction_request.task = "sequence_classification" - - prediction = await test_transformer_sequence_classification.predict(prediction_request) + + prediction = await test_transformer_sequence_classification.predict(prediction_request, Task.sequence_classification) 
np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") - assert len(prediction.task_outputs["labels"]) == len(input) - assert all(isinstance(prediction.task_outputs["labels"][i], int) for i in range(len(input))) + assert len(prediction.labels) == len(input) + assert all(isinstance(prediction.labels[i], int) for i in range(len(input))) assert "logits" in prediction.model_outputs - assert "id2label" in prediction.task_outputs @pytest.mark.asyncio async def test_sequence_classification_output_attention(self, prediction_request, test_transformer_sequence_classification): - prediction_request.task = "sequence_classification" prediction_request.model_kwargs = {"output_attentions": True} - prediction = await test_transformer_sequence_classification.predict(prediction_request) + prediction = await test_transformer_sequence_classification.predict(prediction_request, Task.sequence_classification) assert "attentions" in prediction.model_outputs @pytest.mark.asyncio @@ -34,12 +33,10 @@ async def test_sequence_classification_output_attention(self, prediction_request ids=["single", "batch"]) async def test_sequence_classification_regression(self, prediction_request, test_transformer_sequence_classification, input): prediction_request.input = input - prediction_request.task = "sequence_classification" prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_sequence_classification.predict(prediction_request) + prediction = await test_transformer_sequence_classification.predict(prediction_request, Task.sequence_classification) assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" - assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs @@ -53,14 +50,12 @@ class TestTransformerTokenClassification: ids=["single", "batch"]) async def test_token_classification(self, prediction_request, test_transformer_token_classification, input, word_ids): prediction_request.input = input - prediction_request.task = "token_classification" - prediction = await test_transformer_token_classification.predict(prediction_request) + prediction = await test_transformer_token_classification.predict(prediction_request, Task.token_classification) np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), err_msg="logits are softmax") - assert all(len(prediction.task_outputs["labels"][i]) == len(word_ids[i]) for i in range(len(input))) + assert all(len(prediction.labels[i]) == len(word_ids[i]) for i in range(len(input))) assert "logits" in prediction.model_outputs - assert "id2label" in prediction.task_outputs - assert prediction.task_outputs["word_ids"] == word_ids + assert prediction.word_ids == word_ids @pytest.mark.asyncio @@ -71,14 +66,12 @@ async def test_token_classification(self, prediction_request, test_transformer_t ids=["single", "batch"]) async def test_token_classification_regression(self, prediction_request, test_transformer_token_classification, input, word_ids): prediction_request.input = input - prediction_request.task = "token_classification" prediction_request.task_kwargs = {"is_regression": True} - prediction = await test_transformer_token_classification.predict(prediction_request) + prediction = await test_transformer_token_classification.predict(prediction_request, Task.token_classification) assert not 
np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") - assert "labels" not in prediction.task_outputs assert "logits" in prediction.model_outputs - assert prediction.task_outputs["word_ids"] == word_ids + assert prediction.word_ids == word_ids @pytest.mark.usefixtures("test_transformer_embedding") @@ -93,14 +86,13 @@ class TestTransformerEmbedding: ) async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): prediction_request.input = input - prediction_request.task = "embedding" prediction_request.task_kwargs = {"embedding_mode": mode} - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) assert "hidden_states" not in prediction.model_outputs - assert prediction.task_outputs["embedding_mode"] == mode + assert prediction.embedding_mode == mode @pytest.mark.asyncio @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), @@ -109,32 +101,36 @@ async def test_embedding(self, prediction_request, test_transformer_embedding, i ids=["single", "batch"]) async def test_embedding_token(self, prediction_request, test_transformer_embedding, input, word_ids): prediction_request.input = input - prediction_request.task = "embedding" prediction_request.task_kwargs = {"embedding_mode": "token"} - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) assert "hidden_states" not in prediction.model_outputs - assert prediction.task_outputs["embedding_mode"] == "token" - assert prediction.task_outputs["word_ids"] == word_ids + assert prediction.embedding_mode == "token" + assert prediction.word_ids == word_ids @pytest.mark.asyncio async def test_embedding_unknown_mode(self, prediction_request, test_transformer_embedding): - prediction_request.task = "embedding" prediction_request.task_kwargs = {"embedding_mode": "this mode does not exist"} with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) @pytest.mark.asyncio async def test_forbid_is_preprocessed(self, prediction_request, test_transformer_embedding): - prediction_request.task = "embedding" prediction_request.is_preprocessed = True with pytest.raises(ValueError): - prediction = await test_transformer_embedding.predict(prediction_request) + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_input_too_big(self, prediction_request, test_transformer_embedding): + prediction_request.input = ["test"]*1000 + + with pytest.raises(ValueError): + prediction = await test_transformer_embedding.predict(prediction_request, Task.embedding) @pytest.mark.usefixtures("test_transformer_question_answering") class TestTransformerQuestionAnswering: @@ -146,26 +142,26 @@ class TestTransformerQuestionAnswering: ) async def 
test_question_answering(self, prediction_request, test_transformer_question_answering, input): prediction_request.input = input - prediction_request.task = "question_answering" prediction_request.task_kwargs = {"topk": 1} - prediction = await test_transformer_question_answering.predict(prediction_request) - answers = [input[i][1][prediction.task_outputs["answers"][i][0]["start"]:prediction.task_outputs["answers"][i][0]["end"]] for i in range(len(input))] + prediction = await test_transformer_question_answering.predict(prediction_request, Task.question_answering) + answers = [input[i][1][prediction.answers[i][0].start:prediction.answers[i][0].end] for i in range(len(input))] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs - assert len(prediction.task_outputs["answers"]) == len(input) - assert all(prediction.task_outputs["answers"][i][0]["answer"] == answers[i] for i in range(len(input))) + assert len(prediction.answers) == len(input) + assert all(prediction.answers[i][0].answer == answers[i] for i in range(len(input))) @pytest.mark.asyncio async def test_question_answering_topk(self, prediction_request, test_transformer_question_answering): - prediction_request.input = [["What is a test?", "A test is a thing where you test."]] - prediction_request.task = "question_answering" + input = [["What is a test?", "A test is a thing where you test."]] + prediction_request.input = input prediction_request.task_kwargs = {"topk": 2} - prediction = await test_transformer_question_answering.predict(prediction_request) - answers = [input[0][1][prediction.task_outputs["answers"][0][i]["start"]:prediction.task_outputs["answers"][0][i]["end"]] for i in range(2)] + prediction = await test_transformer_question_answering.predict(prediction_request, Task.question_answering) + answers = [input[0][1][prediction.answers[0][i].start:prediction.answers[0][i].end] for i in range(2)] assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs - assert len(prediction.task_outputs["answers"]) == len(input) - assert all(prediction.task_outputs["answers"][0][i]["answer"] == answers[i] for i in range(2)) + assert len(prediction.answers) == len(input) + assert prediction.answers[0][0].score >= prediction.answers[0][1].score + assert all(prediction.answers[0][i].answer == answers[i] for i in range(2)) @pytest.mark.usefixtures("test_transformer_generation") @@ -176,26 +172,23 @@ class TestTransformerGeneration: ) async def test_generation(self, prediction_request, test_transformer_generation, input): prediction_request.input = input - prediction_request.task = "generation" - prediction = await test_transformer_generation.predict(prediction_request) - assert all(isinstance(prediction.task_outputs["generated_texts"][i][0], str) for i in range(len(input))) + prediction = await test_transformer_generation.predict(prediction_request, Task.generation) + assert all(isinstance(prediction.generated_texts[i][0], str) for i in range(len(input))) @pytest.mark.asyncio async def test_generation_output_attention_and_scores(self, prediction_request, test_transformer_generation): - prediction_request.task = "generation" prediction_request.model_kwargs = { "output_attentions": True, "output_scores": True } - prediction = await test_transformer_generation.predict(prediction_request) + prediction = await test_transformer_generation.predict(prediction_request, Task.generation) assert "scores" in prediction.model_outputs assert "attentions" in prediction.model_outputs 
@pytest.mark.asyncio async def test_generation_beam_sample_multiple_seqs(self, prediction_request, test_transformer_generation): - prediction_request.task = "generation" prediction_request.task_kwargs = { "num_beams": 2, "do_sample": True, @@ -205,5 +198,5 @@ async def test_generation_beam_sample_multiple_seqs(self, prediction_request, te "num_return_sequences": 2 } - prediction = await test_transformer_generation.predict(prediction_request) - assert len(prediction.task_outputs["generated_texts"][0]) == 2 \ No newline at end of file + prediction = await test_transformer_generation.predict(prediction_request, Task.generation) + assert len(prediction.generated_texts[0]) == 2 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/uninstall_requirements.txt b/square-model-inference-api/inference_server/uninstall_requirements.txt new file mode 100644 index 000000000..747b7aa97 --- /dev/null +++ b/square-model-inference-api/inference_server/uninstall_requirements.txt @@ -0,0 +1 @@ +transformers \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 20c5c84d4..c6051be8f 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -11,7 +11,7 @@ http { } location /api/bert-base-uncased { - #auth_request /auth; + auth_request /auth; proxy_pass http://square_model_inference_bert:8000/api; } @@ -22,13 +22,7 @@ http { location /auth { internal; - set $query ''; - if ($request_uri ~* "[^\?]+\?(.*)$") { - set $query $1; - } proxy_pass http://auth:8081/auth; - proxy_pass_request_body off; - proxy_set_header Content-Length ""; } } } \ No newline at end of file From 5b6d3fef3acd936b08b8f6ef5df78dd294b8c1bb Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 13 Jul 2021 13:48:55 +0200 Subject: [PATCH 11/23] Updated adapter-transformers and added tests for adapter model --- .../inference_server/requirements2.txt | 2 +- .../inference/adaptertransformer.py | 5 +- .../models/prediction.py | 1 + .../inference_server/tests/conftest.py | 21 +- .../pre_test_setup_for_docker_caching.py | 2 - .../tests/test_api/test_prediction.py | 85 ++++++- .../tests/test_inference/test_adapter.py | 208 +++++++++++++++++- 7 files changed, 294 insertions(+), 30 deletions(-) diff --git a/square-model-inference-api/inference_server/requirements2.txt b/square-model-inference-api/inference_server/requirements2.txt index 1503d3a94..05ea5692f 100644 --- a/square-model-inference-api/inference_server/requirements2.txt +++ b/square-model-inference-api/inference_server/requirements2.txt @@ -1 +1 @@ -adapter-transformers==2.0.1 \ No newline at end of file +adapter-transformers==2.1.1 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index e9c052c4c..06995863c 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -42,14 +42,11 @@ def _load_adapter(self, model_name, transformers_cache): logger.debug(f"Loading adapter {adapter}") try: self.model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=transformers_cache) - except KeyError as e: - logger.debug(f"Could not load {adapter} due to incomplete config: Missing key 
{e.args[0]}") except RuntimeError as e: if "Error(s) in loading state_dict" in e.args[0]: logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") else: - raise(e) - + raise e # def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 67be53aba..18a4f2ed3 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -54,6 +54,7 @@ class PredictionOutput(BaseModel): The results of the prediction of the model on the given input for the requested task. """ model_outputs: Dict = Field( + {}, description="Dictionary containing the model tensor outputs either as plain list or as base64-encoded numpy array.

" "Decode the base64 string 'arr_string_b64' back to an array in Python like this:
" "arr_binary_b64 = arr_string_b64.encode()
" diff --git a/square-model-inference-api/inference_server/tests/conftest.py b/square-model-inference-api/inference_server/tests/conftest.py index 268f69990..fc8984e78 100644 --- a/square-model-inference-api/inference_server/tests/conftest.py +++ b/square-model-inference-api/inference_server/tests/conftest.py @@ -17,8 +17,9 @@ from main import get_app from square_model_inference.inference.model import Model -from square_model_inference.models.prediction import PredictionOutput -from square_model_inference.models.request import PredictionRequest +from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForGeneration, \ + PredictionOutputForEmbedding, PredictionOutputForTokenClassification, PredictionOutputForSequenceClassification, PredictionOutputForQuestionAnswering +from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.inference.transformer import Transformer from square_model_inference.inference.adaptertransformer import AdapterTransformer from square_model_inference.inference.sentencetransformer import SentenceTransformer @@ -32,11 +33,17 @@ def test_app(): class TestModel(Model): - def __init__(self): - self.prediction = PredictionOutput(model_outputs={}, task_outputs={}) - - async def predict(self, payload: PredictionRequest) -> PredictionOutput: - return self.prediction + async def predict(self, payload, task) -> PredictionOutput: + if task == Task.generation: + return PredictionOutputForGeneration(generated_texts=[[""]]) + elif task == Task.question_answering: + return PredictionOutputForQuestionAnswering(answers=[[{"score": 0, "start": 0, "end": 0, "answer": ""}]]) + elif task == Task.embedding: + return PredictionOutputForEmbedding(word_ids=[[0]]) + elif task == Task.token_classification: + return PredictionOutputForTokenClassification(word_ids=[[0]]) + elif task == Task.sequence_classification: + return PredictionOutputForSequenceClassification() # We only load bert-base-uncased, so we fix the random seed to always get the same randomly generated heads on top diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index bb41c27ba..12c6de162 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -39,8 +39,6 @@ logger.debug(f"Loading adapter {adapter}") try: model.load_adapter(adapter, load_as=adapter, with_head=True, cache_dir=TRANSFORMERS_TESTING_CACHE) - except KeyError as e: - logger.debug(f"Could not load {adapter} due to incomplete config:\n{e.args[0]}") except RuntimeError as e: if "Error(s) in loading state_dict" in e.args[0]: logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") diff --git a/square-model-inference-api/inference_server/tests/test_api/test_prediction.py b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py index 9829ff680..77767924a 100644 --- a/square-model-inference-api/inference_server/tests/test_api/test_prediction.py +++ b/square-model-inference-api/inference_server/tests/test_api/test_prediction.py @@ -1,10 +1,10 @@ from starlette.testclient import TestClient -def test_prediction(test_app) -> None: +def test_api_sequence_classification(test_app) -> None: test_client = TestClient(test_app) response = test_client.post( - 
"/api/predict", + "/api/sequence-classification", json={ "input": [ "this is a test" @@ -12,7 +12,6 @@ def test_prediction(test_app) -> None: "is_preprocessed": False, "preprocessing_kwargs": {}, "model_kwargs": {}, - "task": "embedding", "task_kwargs": {}, "adapter_name": "" } @@ -20,20 +19,88 @@ def test_prediction(test_app) -> None: assert response.status_code == 200 -def test_prediction_wrong_request(test_app) -> None: +def test_api_sequence_classification_malformed_input(test_app) -> None: test_client = TestClient(test_app, raise_server_exceptions=False) response = test_client.post( - "/api/predict", json={ + "/api/sequence-classification", json={ + # "input": [ + # "this is a test" + # ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 422 + + +def test_api_token_classification(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/token-classification", + json={ "input": [ "this is a test" ], "is_preprocessed": False, "preprocessing_kwargs": {}, - # required - #"model_kwargs": {}, - #"task": "embedding", + "model_kwargs": {}, "task_kwargs": {}, "adapter_name": "" } ) - assert response.status_code == 422 + assert response.status_code == 200 + +def test_api_embedding(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/embedding", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 + + +def test_api_question_answering(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/question-answering", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 + + +def test_api_generation(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.post( + "/api/generation", + json={ + "input": [ + "this is a test" + ], + "is_preprocessed": False, + "preprocessing_kwargs": {}, + "model_kwargs": {}, + "task_kwargs": {}, + "adapter_name": "" + } + ) + assert response.status_code == 200 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py index aa02d1e35..b379c3e2a 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_adapter.py @@ -1,11 +1,205 @@ import pytest -from square_model_inference.models.request import PredictionRequest import numpy as np -# todo -@pytest.mark.skip(reason="Waiting for next update of adapter-transformers") -def test_adapter(): - pytest.fail("Adapter tests waiting for next version of adapter-transformers. " - "Not enough heads available. 
" - "Waiting for https://github.com/Adapter-Hub/adapter-transformers/commit/94ff58dbf09ac1d14b7107c9ce48770d2d352161") \ No newline at end of file +from square_model_inference.models.request import Task + + +@pytest.mark.usefixtures("test_adapter") +class TestTransformerAdapter: + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], + ids=["single", "batch"]) + async def test_sequence_classification(self, prediction_request, test_adapter, input): + prediction_request.input = input + prediction_request.adapter_name = "nli/rte@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.sequence_classification) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), [1.0]*len(input), err_msg="logits are softmax") + assert len(prediction.labels) == len(input) + assert all(isinstance(prediction.labels[i], int) for i in range(len(input))) + assert "logits" in prediction.model_outputs + + @pytest.mark.asyncio + async def test_sequence_classification_output_attention(self, prediction_request, test_adapter): + prediction_request.model_kwargs = {"output_attentions": True} + prediction_request.adapter_name = "nli/rte@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.sequence_classification) + assert "attentions" in prediction.model_outputs + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["this is a test"]), + (["this is a test", "this is a test with a longer sentence"])], + ids=["single", "batch"]) + async def test_sequence_classification_regression(self, prediction_request, test_adapter, input): + prediction_request.input = input + prediction_request.task_kwargs = {"is_regression": True} + prediction_request.adapter_name = "nli/rte@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.sequence_classification) + assert not np.array_equal(np.sum(prediction.model_outputs["logits"], axis=-1)-1, [0.0]*len(input)), "logits are not softmax" + assert "logits" in prediction.model_outputs + + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification(self, prediction_request, test_adapter, input, word_ids): + prediction_request.input = input + prediction_request.adapter_name = "ner/conll2003@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.token_classification) + np.testing.assert_allclose(np.sum(prediction.model_outputs["logits"], axis=-1), np.ones(shape=(len(input), len(word_ids[0]))), atol=1e-6, err_msg="logits should be softmax") + assert all(len(prediction.labels[i]) == len(word_ids[i]) for i in range(len(input))) + assert "logits" in prediction.model_outputs + assert prediction.word_ids == word_ids + + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], + [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_token_classification_regression(self, prediction_request, test_adapter, input, word_ids): + prediction_request.input = input + prediction_request.task_kwargs = {"is_regression": True} + 
prediction_request.adapter_name = "ner/conll2003@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.token_classification) + assert not np.array_equal((np.sum(prediction.model_outputs["logits"], axis=-1), np.ones_like(word_ids)), "logits are not softmax") + assert "logits" in prediction.model_outputs + assert prediction.word_ids == word_ids + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,mode", [(["this is a test"], "mean"), + (["this is a test", "this is a test with a longer sentence"], "mean"), + (["this is a test"], "max"), + (["this is a test", "this is a test with a longer sentence"], "max"), + (["this is a test"], "cls"), + (["this is a test", "this is a test with a longer sentence"], "cls")], + ) + async def test_embedding(self, prediction_request, test_adapter, input, mode): + prediction_request.input = input + prediction_request.task_kwargs = {"embedding_mode": mode} + prediction_request.adapter_name = "sts/sts-b@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.embedding) + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.embedding_mode == mode + + @pytest.mark.asyncio + @pytest.mark.parametrize("input,word_ids", [(["this is a test"], [[None, 0, 1, 2, 3, None]]), + (["this is a test", "this is a test with a longer sentence"], + [[None, 0, 1, 2, 3, None, None, None, None, None], [None, 0, 1, 2, 3, 4, 5, 6, 7, None]])], + ids=["single", "batch"]) + async def test_embedding_token(self, prediction_request, test_adapter, input, word_ids): + prediction_request.input = input + prediction_request.task_kwargs = {"embedding_mode": "token"} + prediction_request.adapter_name = "sts/sts-b@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.embedding) + assert np.array(prediction.model_outputs["embeddings"]).shape[2] == 768 + assert np.array(prediction.model_outputs["embeddings"]).shape[1] == len(word_ids[0]) + assert np.array(prediction.model_outputs["embeddings"]).shape[0] == len(input) + assert "hidden_states" not in prediction.model_outputs + assert prediction.embedding_mode == "token" + assert prediction.word_ids == word_ids + + @pytest.mark.asyncio + async def test_embedding_unknown_mode(self, prediction_request, test_adapter): + prediction_request.task_kwargs = {"embedding_mode": "this mode does not exist"} + prediction_request.adapter_name = "sts/sts-b@ukp" + + with pytest.raises(ValueError): + prediction = await test_adapter.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_forbid_is_preprocessed(self, prediction_request, test_adapter): + prediction_request.is_preprocessed = True + prediction_request.adapter_name = "sts/sts-b@ukp" + + with pytest.raises(ValueError): + prediction = await test_adapter.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + async def test_input_too_big(self, prediction_request, test_adapter): + prediction_request.input = ["test"]*1000 + prediction_request.adapter_name = "sts/sts-b@ukp" + + with pytest.raises(ValueError): + prediction = await test_adapter.predict(prediction_request, Task.embedding) + + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [([["What is a test?", "A test is a thing where you test."]]), + ([["What is a test?", "A test is a thing where you test."], + ["What is a longer test?", "A test is a thing where you test. 
If it is longer you call it longer"]])], + ) + async def test_question_answering(self, prediction_request, test_adapter, input): + prediction_request.input = input + prediction_request.task_kwargs = {"topk": 1} + prediction_request.adapter_name = "qa/squad2@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.question_answering) + answers = [input[i][1][prediction.answers[i][0].start:prediction.answers[i][0].end] for i in range(len(input))] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.answers) == len(input) + assert all(prediction.answers[i][0].answer == answers[i] for i in range(len(input))) + + @pytest.mark.asyncio + async def test_question_answering_topk(self, prediction_request, test_adapter): + input = [["What is a test?", "A test is a thing where you test."]] + prediction_request.input = input + prediction_request.task_kwargs = {"topk": 2} + prediction_request.adapter_name = "qa/squad2@ukp" + + prediction = await test_adapter.predict(prediction_request, Task.question_answering) + answers = [input[0][1][prediction.answers[0][i].start:prediction.answers[0][i].end] for i in range(2)] + assert "start_logits" in prediction.model_outputs and "end_logits" in prediction.model_outputs + assert len(prediction.answers) == len(input) + assert prediction.answers[0][0].score >= prediction.answers[0][1].score + assert all(prediction.answers[0][i].answer == answers[i] for i in range(2)) + + @pytest.mark.skip("No generation adapter for bert-base-uncased available currently.") + @pytest.mark.asyncio + @pytest.mark.parametrize("input", [(["Generate text"]), + (["Generate text", "And more text"])], + ) + async def test_generation(self, prediction_request, test_adapter, input): + prediction_request.input = input + + prediction = await test_adapter.predict(prediction_request, Task.generation) + assert all(isinstance(prediction.generated_texts[i][0], str) for i in range(len(input))) + + @pytest.mark.skip("No generation adapter for bert-base-uncased available currently.") + @pytest.mark.asyncio + async def test_generation_output_attention_and_scores(self, prediction_request, test_adapter): + prediction_request.model_kwargs = { + "output_attentions": True, + "output_scores": True + } + + prediction = await test_adapter.predict(prediction_request, Task.generation) + assert "scores" in prediction.model_outputs + assert "attentions" in prediction.model_outputs + + @pytest.mark.skip("No generation adapter for bert-base-uncased available currently.") + @pytest.mark.asyncio + async def test_generation_beam_sample_multiple_seqs(self, prediction_request, test_adapter): + prediction_request.task_kwargs = { + "num_beams": 2, + "do_sample": True, + "top_k": 10, + "top_p": 0.5, + "no_repeat_ngram_size": 2, + "num_return_sequences": 2 + } + + prediction = await test_adapter.predict(prediction_request, Task.generation) + assert len(prediction.generated_texts[0]) == 2 \ No newline at end of file From e368434cf0a3fea6859f4ab3c3acb3e01c5f1017 Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 13 Jul 2021 15:27:16 +0200 Subject: [PATCH 12/23] Updated auth server; fixed output serialization problem discovered by Docker integration test --- .../auth_server/.env.example | 2 +- .../auth_server/Dockerfile | 6 +-- .../auth_server/auth_api/messages.py | 2 +- .../auth_server/auth_api/security.py | 13 +++--- .../auth_server/requirements.txt | 6 +-- square-model-inference-api/docker-compose.yml | 40 ------------------ 
.../example_docker-compose.yml | 42 +++++++++++++++++++ .../models/prediction.py | 12 +++++- 8 files changed, 65 insertions(+), 58 deletions(-) delete mode 100644 square-model-inference-api/docker-compose.yml create mode 100644 square-model-inference-api/example_docker-compose.yml diff --git a/square-model-inference-api/auth_server/.env.example b/square-model-inference-api/auth_server/.env.example index 6125fa20f..95fa8b3b7 100644 --- a/square-model-inference-api/auth_server/.env.example +++ b/square-model-inference-api/auth_server/.env.example @@ -1,2 +1,2 @@ API_KEY=example_key -API_KEY_NAME=api_key \ No newline at end of file +API_KEY_NAME=Authorization \ No newline at end of file diff --git a/square-model-inference-api/auth_server/Dockerfile b/square-model-inference-api/auth_server/Dockerfile index 0ced2f573..aadc437d3 100644 --- a/square-model-inference-api/auth_server/Dockerfile +++ b/square-model-inference-api/auth_server/Dockerfile @@ -1,13 +1,13 @@ -FROM python:3.8.2 +FROM python:3.7.6-slim-buster ENV PYTHONUNBUFFERED 1 EXPOSE 8081 WORKDIR /app +RUN pip install --upgrade pip COPY requirements.txt ./ -RUN pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip install -r requirements.txt COPY . ./ diff --git a/square-model-inference-api/auth_server/auth_api/messages.py b/square-model-inference-api/auth_server/auth_api/messages.py index 85c28dc04..03eb2e101 100644 --- a/square-model-inference-api/auth_server/auth_api/messages.py +++ b/square-model-inference-api/auth_server/auth_api/messages.py @@ -1,4 +1,4 @@ -NO_API_KEY = "No API key provided." +NO_API_KEY = "No API key provided in header {}." AUTH_REQ = "Authentication required." HTTP_500_DETAIL = "Internal server error." diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py index 8f6082a8c..943eee929 100644 --- a/square-model-inference-api/auth_server/auth_api/security.py +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -1,24 +1,21 @@ import secrets from fastapi import HTTPException, Security -from fastapi.security.api_key import APIKeyHeader, APIKeyQuery +from fastapi.security.api_key import APIKeyHeader from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED from .config import API_KEY, API_KEY_NAME from .messages import AUTH_REQ, NO_API_KEY -api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False) api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) -def validate_request(query: str = Security(api_key_query), - header: str = Security(api_key_header),) -> bool: - if query is None and header is None: +def validate_request(header: str = Security(api_key_header),) -> bool: + if header is None: raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY, headers={} + status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_NAME), headers={} ) - elif not secrets.compare_digest(query, str(API_KEY)) \ - or not secrets.compare_digest(header, str(API_KEY)): + elif not secrets.compare_digest(header, str(API_KEY)): raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, detail=AUTH_REQ, headers={} ) diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt index e059f7f5f..c3f33f964 100644 --- a/square-model-inference-api/auth_server/requirements.txt +++ b/square-model-inference-api/auth_server/requirements.txt @@ -1,3 +1,3 @@ -uvicorn -fastapi -pydantic +uvicorn==0.13.4 +fastapi==0.65.1 
+pydantic==1.8.2 diff --git a/square-model-inference-api/docker-compose.yml b/square-model-inference-api/docker-compose.yml deleted file mode 100644 index ada141ee6..000000000 --- a/square-model-inference-api/docker-compose.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: "3" - -services: - inference_bert: - build: ./inference_server/. - image: square_model_inference:latest - ports: - - 8000:8000 - env_file: - - ./inference_server/.env.bert - container_name: square_model_inference_bert - volumes: - - ./.cache/:/etc/huggingface/.cache/ - - ./inference_server/:/app/ #for developement - - #inference_roberta: - # build: ./inference_server/. - # image: square_model_inference:latest - # ports: - # - 8001:8000 - # env_file: - # - ./inference_server/.env.roberta - # container_name: square_model_inference_roberta - # volumes: - # - ./.cache/:/etc/huggingface/.cache/ - # - ./inference_server/:/app/ #for developement - - auth: - build: ./auth_server/. - ports: - - 8081:8081 - env_file: - - ./auth_server/.env.example - - nginx: - image: nginx - ports: - - 8080:8080 - volumes: - - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro \ No newline at end of file diff --git a/square-model-inference-api/example_docker-compose.yml b/square-model-inference-api/example_docker-compose.yml new file mode 100644 index 000000000..d15b00b0a --- /dev/null +++ b/square-model-inference-api/example_docker-compose.yml @@ -0,0 +1,42 @@ +version: "3" + +services: + auth: + build: ./auth_server/. + ports: + - 8081:8081 + env_file: + - ./auth_server/.env.example + + nginx: + image: nginx + ports: + - 8080:8080 + volumes: + - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro + + # Example config for one model server + inference_bert: + build: ./inference_server/. + image: square_model_inference:latest + ports: + - 8000:8000 + env_file: + - ./inference_server/.env.bert + container_name: square_model_inference_bert + volumes: + - ./.cache/:/etc/huggingface/.cache/ + #- ./inference_server/:/app/ #for developement + + # Example config for another model server + inference_roberta: + build: ./inference_server/. 
+ image: square_model_inference:latest + ports: + - 8001:8000 + env_file: + - ./inference_server/.env.roberta + container_name: square_model_inference_roberta + volumes: + - ./.cache/:/etc/huggingface/.cache/ + #- ./inference_server/:/app/ #for developement diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 18a4f2ed3..3f0963f69 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -36,15 +36,23 @@ def encode(arr): # arr_binary = base64.decodebytes(arr_binary_b64) # arr = np.load(BytesIO(arr_binary)) - # Recursively go through a value and encode leafs (=tensors) it or iterate over values and encode them + # Recursively go through a value and encode leaves (=tensors) it or iterate over values and encode them def enc_or_iterate(val): + # Stop attempt to encode an already encoded array + # This can happen because PredictionOutput is initialized twice + # - once by Model and once when request response is serialized by fastAPI + if isinstance(val, int) or isinstance(val, float) or isinstance(val, str): + raise ValueError("Array is already encoded") if isinstance(val, Iterable) and not isinstance(val, torch.Tensor) and not isinstance(val, np.ndarray): return [enc_or_iterate(v) for v in val] else: return encode(val) for k, v in obj.items(): - v = enc_or_iterate(v) + try: + v = enc_or_iterate(v) + except ValueError: + break obj[k] = v return obj From 9575ef8e7a14e27581dedd72683cb21314ceb9f4 Mon Sep 17 00:00:00 2001 From: Geigle Date: Wed, 14 Jul 2021 14:19:15 +0200 Subject: [PATCH 13/23] Extended documentation --- square-model-inference-api/.dockerignore | 1 + square-model-inference-api/.gitignore | 1 + square-model-inference-api/Makefile | 8 +-- square-model-inference-api/README.md | 69 +++++++++---------- .../auth_server/.env.example | 2 +- .../auth_server/auth_api/config.py | 4 +- .../auth_server/auth_api/messages.py | 1 - .../auth_server/auth_api/security.py | 6 +- .../auth_server/main.py | 7 +- .../auth_server/requirements.txt | 6 +- .../inference_server/.env.example | 26 +++++++ .../inference_server/main.py | 2 +- .../inference_server/requirements1.txt | 10 +-- .../api/routes/heartbeat.py | 8 +-- .../api/routes/prediction.py | 7 +- .../square_model_inference/core/config.py | 1 + .../core/event_handlers.py | 4 ++ .../inference/adaptertransformer.py | 2 - .../models/heartbeat.py | 2 +- .../models/prediction.py | 3 +- 20 files changed, 104 insertions(+), 66 deletions(-) create mode 100644 square-model-inference-api/inference_server/.env.example diff --git a/square-model-inference-api/.dockerignore b/square-model-inference-api/.dockerignore index 0af8215d8..cc3361cfa 100644 --- a/square-model-inference-api/.dockerignore +++ b/square-model-inference-api/.dockerignore @@ -60,6 +60,7 @@ nosetests.xml coverage.xml *.cover .hypothesis/ +.model_testing_cache # Translations *.mo diff --git a/square-model-inference-api/.gitignore b/square-model-inference-api/.gitignore index 7389010d8..ed7883485 100644 --- a/square-model-inference-api/.gitignore +++ b/square-model-inference-api/.gitignore @@ -57,6 +57,7 @@ nosetests.xml coverage.xml *.cover .hypothesis/ +.model_testing_cache # Translations *.mo diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 13e50caf8..0e8bc2e54 100644 --- 
a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -8,16 +8,16 @@ all: clean test install run deploy down test: python -m pip install --upgrade pip && pip install pytest pytest-cov pytest-asyncio - pytest --cov + PYTHONPATH=inference_server/ pytest --cov install: pip install --upgrade pip pip install -r inference_server/requirements1.txt - pip uninstall -y -r inference_server/uninstall_requirements.txt - pip install -r inference_server/requirements2.txt + pip uninstall -y -r inference_server/uninstall_requirements.txt + pip install -r inference_server/requirements2.txt run: - PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8002 --env-file inference_server/.env + PYTHONPATH=inference_server/ uvicorn inference_server.main:app --reload --host 0.0.0.0 --port 8000 --env-file inference_server/.env build: docker-compose build diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 324b89cdc..c51c79ab3 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -4,21 +4,30 @@ Receives input and returns prediction and other artifacts (e.g. attention scores ## Project structure -TODO +The Model API uses 3 components: +1 authorization server, n inference servers (each with their own model), +and a nginx server that serves as API gateway to forward requests to the correct inference server and +to handle authorization of requests with the auth server. ``` -├───tests -│ ├───test_api -│ └───test_service -├───auth_server +├───auth_server # FastAPI Authorization Server +│ ├───main.py # Entry point in server +│ ├───Dockerfile # Dockerfile for server │ └───auth_api -├───nginx -├───inference_server -│ ├───square_model_inference -│ │ ├───api - Model API -│ │ │ ├───routes -│ │ ├───core -│ │ ├───models - Pydantic model definitions for in-/ output -│ │ ├───inference +├───nginx # nginx config for API Gateway & Authorizatio +│ └───nginx.conf +├───inference_server # FastAPI Model API Server +│ ├───tests # Unit Tests +│ │ ├───test_api +│ │ ├───test_inference +│ ├───main.py # Entry point in server +│ ├───Dockerfile # Dockerfile for server +│ └───square_model_inference # Server Logic +│ ├───api # API Routes +│ │ ├───routes +│ ├───core # Server config, Startup logic, etc. +│ ├───models # Input/ output modelling for API +│ └───inference # Deep Model implementation and inference code for NLP tasks +└───example_docker-compose.yml # Example docker-compose setup for the Model API ``` ## Requirements @@ -43,11 +52,16 @@ Both `transformers` and `adapter-transformers` use the same namespace so they co Thus, we first install `sentence-transformers` along with `transformers`, uninstall `transformers`, and finally install `adapter-transformers`. +## Running + #### Running Localhost ```sh make run ``` +This *only* starts one inference server using `inference_server/.env`. No nginx, no auth server. +For debugging, `inference_server/main.py` can also be used as entry. + #### Running Via Docker @@ -62,25 +76,10 @@ make test ``` ## Setup -TODO - -## Run without `make` for development - -1. Start your app with: -```bash -PYTHONPATH=./inference_server uvicorn main:app --reload -``` - -2. 
Go to [http://localhost:8000/docs](http://localhost:8000/docs) or [http://localhost:8000/redoc](http://localhost:8000/redoc) for alternative swagger - - -## Run Tests with using `make` - -Install testing libraries and run `tox`: -```bash -pip install tox pytest flake8 coverage bandit -tox -``` -This runs tests and coverage for Python 3.8 and Flake8, Bandit. - - +1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. +2. For each model server that should run, create a `.env.$model` to configure it. + See [here](inference_server/.env.example) for an example. +3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to + match `container_name` of each server in the `docker-compose.yaml`. +4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the +model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example. \ No newline at end of file diff --git a/square-model-inference-api/auth_server/.env.example b/square-model-inference-api/auth_server/.env.example index 95fa8b3b7..6f37be409 100644 --- a/square-model-inference-api/auth_server/.env.example +++ b/square-model-inference-api/auth_server/.env.example @@ -1,2 +1,2 @@ API_KEY=example_key -API_KEY_NAME=Authorization \ No newline at end of file +API_KEY_HEADER_NAME=Authorization \ No newline at end of file diff --git a/square-model-inference-api/auth_server/auth_api/config.py b/square-model-inference-api/auth_server/auth_api/config.py index ebd10abc4..ec1987091 100644 --- a/square-model-inference-api/auth_server/auth_api/config.py +++ b/square-model-inference-api/auth_server/auth_api/config.py @@ -3,5 +3,7 @@ config = Config(".env") +# The API key required to get authorization API_KEY: Secret = config("API_KEY", cast=Secret) -API_KEY_NAME: str = config("API_KEY_NAME", cast=str, default="api_key") +# Name of the header in the request that contains the API key +API_KEY_HEADER_NAME: str = config("API_KEY_HEADER_NAME", cast=str, default="Authorization") diff --git a/square-model-inference-api/auth_server/auth_api/messages.py b/square-model-inference-api/auth_server/auth_api/messages.py index 03eb2e101..16425b6c8 100644 --- a/square-model-inference-api/auth_server/auth_api/messages.py +++ b/square-model-inference-api/auth_server/auth_api/messages.py @@ -1,4 +1,3 @@ NO_API_KEY = "No API key provided in header {}." AUTH_REQ = "Authentication required." -HTTP_500_DETAIL = "Internal server error." 
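For context on how these auth settings fit together once everything is deployed: below is a minimal client-side sketch of calling a model server through the nginx gateway with the API key from `auth_server/.env`. The host/port, URL path, and payload fields are illustrative assumptions (they depend on the nginx config, the container names, and the model server's `.env`), not part of this patch:

```python
import requests

# Assumptions (hypothetical): nginx gateway on localhost:8080, a model server exposed
# under /api/bert, and API_KEY / API_KEY_HEADER_NAME as configured in auth_server/.env
API_KEY = "example_key"
HEADERS = {"Authorization": API_KEY}  # header name must match API_KEY_HEADER_NAME

payload = {
    "input": ["Hello, world!"],   # list of sentences
    "preprocessing_kwargs": {},
    "model_kwargs": {},
    "task_kwargs": {"embedding_mode": "mean"},
}

response = requests.post(
    "http://localhost:8080/api/bert/embedding",
    json=payload,
    headers=HEADERS,
)
response.raise_for_status()
print(response.json())
```

Requests without the header, or with a wrong key, are rejected by the auth server with 400/401, as implemented by the security checks in the next diff.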
diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py index 943eee929..1a74e31d8 100644 --- a/square-model-inference-api/auth_server/auth_api/security.py +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -4,16 +4,16 @@ from fastapi.security.api_key import APIKeyHeader from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED -from .config import API_KEY, API_KEY_NAME +from .config import API_KEY, API_KEY_HEADER_NAME from .messages import AUTH_REQ, NO_API_KEY -api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) +api_key_header = APIKeyHeader(name=API_KEY_HEADER_NAME, auto_error=False) def validate_request(header: str = Security(api_key_header),) -> bool: if header is None: raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_NAME), headers={} + status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_HEADER_NAME), headers={} ) elif not secrets.compare_digest(header, str(API_KEY)): raise HTTPException( diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index 90332c6d2..93894341f 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -3,6 +3,9 @@ app = FastAPI() + @app.get("/auth") -async def auth(authenticated: bool = Depends(security.validate_request),): - return {"authenticated": True} +async def auth(authenticated: bool = Depends(security.validate_request)): + # authenticated is always True because security.validate_request raises an exception if validation fails + # and it does not return False + return {"authenticated": authenticated} diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt index c3f33f964..1141b0ad8 100644 --- a/square-model-inference-api/auth_server/requirements.txt +++ b/square-model-inference-api/auth_server/requirements.txt @@ -1,3 +1,3 @@ -uvicorn==0.13.4 -fastapi==0.65.1 -pydantic==1.8.2 +uvicorn==0.13.4 # ASGI server +fastapi==0.65.1 # REST API Framework +pydantic==1.8.2 # Input/ output modelling diff --git a/square-model-inference-api/inference_server/.env.example b/square-model-inference-api/inference_server/.env.example new file mode 100644 index 000000000..cff919980 --- /dev/null +++ b/square-model-inference-api/inference_server/.env.example @@ -0,0 +1,26 @@ +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers +MODEL_NAME=bert-base-uncased +# Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model +MODEL_TYPE=adapter + +# Disable CUDA even if available +DISABLE_GPU=True +# Batch size used for many inputs +BATCH_SIZE=32 +# Inputs larger than this size are rejected +MAX_INPUT_SIZE=1024 + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE=../.cache + +# For MODEL_TYPE=transformers: decides the AutoModel* class used +# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class +MODEL_CLASS=base + +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). 
+# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS=False \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index 4d4f0825c..e7004c49e 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -17,4 +17,4 @@ def get_app() -> FastAPI: if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) + uvicorn.run(app, host="0.0.0.0", port=8002) diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt index d8a296d3e..00e9c0a57 100644 --- a/square-model-inference-api/inference_server/requirements1.txt +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -1,8 +1,8 @@ -uvicorn==0.13.4 -fastapi==0.65.1 -pydantic==1.8.2 -loguru==0.5.3 -python-dotenv==0.17.1 +uvicorn==0.13.4 # ASGI server +fastapi==0.65.1 # REST API Framework +pydantic==1.8.2 # Input/ output modelling +loguru==0.5.3 # Easier logging +python-dotenv==0.17.1 # Required for .env configs sentencepiece==0.1.95 torch==1.8.1 sentence-transformers==1.2.0 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py index 702a921bc..67c0570f2 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/heartbeat.py @@ -1,11 +1,11 @@ from fastapi import APIRouter -from square_model_inference.models.heartbeat import HearbeatResult +from square_model_inference.models.heartbeat import HeartbeatResult router = APIRouter() -@router.get("/heartbeat", response_model=HearbeatResult, name="heartbeat") -def get_hearbeat() -> HearbeatResult: - heartbeat = HearbeatResult(is_alive=True) +@router.get("/heartbeat", response_model=HeartbeatResult, name="heartbeat") +def get_hearbeat() -> HeartbeatResult: + heartbeat = HeartbeatResult(is_alive=True) return heartbeat diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index c06c0633f..eae7bda22 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -8,6 +8,7 @@ router = APIRouter() + @router.post("/sequence-classification", response_model=PredictionOutputForSequenceClassification, name="sequence classification") async def sequence_classification( request: Request, @@ -15,11 +16,12 @@ async def sequence_classification( ) -> PredictionOutputForSequenceClassification: logger.info(f"Sequence Classification Request: {prediction_request.dict()}") - model = request.app.state.model + model = request.app.state.model # Access model from global app state prediction = await model.predict(prediction_request, Task.sequence_classification) return prediction + @router.post("/token-classification", response_model=PredictionOutputForTokenClassification, name="token classification") async def token_classification( request: Request, @@ -32,6 
+34,7 @@ async def token_classification( return prediction + @router.post("/embedding", response_model=PredictionOutputForEmbedding, name="embedding") async def embedding( request: Request, @@ -44,6 +47,7 @@ async def embedding( return prediction + @router.post("/question-answering", response_model=PredictionOutputForQuestionAnswering, name="question answering") async def question_answering( request: Request, @@ -56,6 +60,7 @@ async def question_answering( return prediction + @router.post("/generation", response_model=PredictionOutputForGeneration, name="generation") async def generation( request: Request, diff --git a/square-model-inference-api/inference_server/square_model_inference/core/config.py b/square-model-inference-api/inference_server/square_model_inference/core/config.py index 224417b3d..aba698196 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/config.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/config.py @@ -16,6 +16,7 @@ DISABLE_GPU: bool = config("DISABLE_GPU", cast=bool, default=False) # Batch size used for many inputs BATCH_SIZE: int = config("BATCH_SIZE", cast=int, default=32) +# Inputs larger than this size are rejected MAX_INPUT_SIZE: int = config("MAX_INPUT_SIZE", cast=int, default=1024) # Cache directory where model weights are stored diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 9498a0b38..6011b30ec 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -24,7 +24,11 @@ "max_input_size": MAX_INPUT_SIZE } + def _startup_model(app: FastAPI) -> None: + """ + Initialize the model used by the server and set it to the app state for global access + """ if MODEL_TYPE not in MODEL_MAPPING: raise RuntimeError(f"Unknown MODEL_MAPPING. 
Must be one of {MODEL_MAPPING.keys()}") model_instance = MODEL_MAPPING[MODEL_TYPE](**MODEL_KWARGS) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 06995863c..5c97532e9 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -1,5 +1,3 @@ -import json - from loguru import logger from transformers import AutoModelWithHeads, list_adapters diff --git a/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py index 34a47e56e..55f1fea20 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/heartbeat.py @@ -1,5 +1,5 @@ from pydantic import BaseModel -class HearbeatResult(BaseModel): +class HeartbeatResult(BaseModel): is_alive: bool diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index 3f0963f69..a3f1d7fbb 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -1,5 +1,4 @@ -from collections import Iterable -from typing import Dict, Union, Tuple, List, Optional +from typing import Dict, Union, Tuple, List, Optional, Iterable import torch from io import BytesIO From d75668c991056802e4e97fd27f133f6e95f3e395 Mon Sep 17 00:00:00 2001 From: Geigle Date: Wed, 14 Jul 2021 16:17:16 +0200 Subject: [PATCH 14/23] Added offline encoding script for data api --- .../offline_encoding_for_data_api.py | 322 ++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 square-model-inference-api/offline_encoding_for_data_api.py diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py new file mode 100644 index 000000000..4f787e24e --- /dev/null +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -0,0 +1,322 @@ +# This is a stand-alone script that does not require additional code (except the imported packages). +# We copy-pasted code from Model API and removing some not needed stuff for this. +import argparse +import os +import logging +import json +import pickle +from typing import Union, List + +import torch +from pydantic import BaseModel, Field +# Conditionally load adapter or sentence-transformer later to simplify installation +#from sentence_transformers import SentenceTransformer as SentenceTransformerModel +import transformers +from transformers import AutoModel, AutoTokenizer #, AutoModelWithHeads + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class PredictionRequest(BaseModel): + """ + Prediction request containing the input, pre-processing parameters, parameters for the model forward pass, + the task with task-specific parameters, and parameters for any post-processing + """ + input: Union[List[str], List[List[str]], dict] = Field( + ..., + description="Input for the model. 
Supports Huggingface Transformer inputs (i.e., list of sentences, or " + "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " + "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True. " + "

" + "Transformer/ Adapter:
" + "Task 'question_answering' expects the input to be in the (question, context) format." + ) + preprocessing_kwargs: dict = Field( + default={}, + description="Optional dictionary containing additional parameters for the pre-processing step.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the Huggingface tokenizer for possible parameters." + ) + model_kwargs: dict = Field( + default={}, + description="Optional dictionary containing parameters that are passed to the model for the forward pass " + "to control what additional tensors are returned.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" + "For example, set ‘output_attentions=True’ to receive the attention results in the output." + ) + task_kwargs: dict = Field( + default={}, + description="Optional dictionary containing additional parameters for handling of the task and " + "task-related post-processing.

" + "SentenceTransformer: This is ignored.
" + "Transformer/ Adapter:
" + "'sentence_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'token_classification':
" + "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" + "'embedding':
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "'question_answering':
" + "- 'topk': Return the top-k most likely spans. Default 1.
" + "- 'max_answer_len': Maximal token length of answers. Default 128.
" + "'generation':
" + "- 'clean_up_tokenization_spaces': See parameter in Huggingface tokenizer.decode(). Default False
" + "- See Huggingface model.generate() for all possible parameters that can be used. " + "Note, 'model_kwargs' and 'task_kwargs' are merged for generation." + + ) + + +class SentenceTransformer: + """ + The class for all sentence-transformers models + """ + + def __init__(self, model_name, batch_size, disable_gpu): + """ + Initialize the SentenceTransformer + :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param max_input_size: requests with a larger input are rejected + """ + self._load_model(model_name, disable_gpu) + self.batch_size = batch_size + + def _load_model(self, model_name, disable_gpu): + """ + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. + """ + import sentence_transformers + logger.debug(f"Loading model {model_name}") + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = sentence_transformers.SentenceTransformer(model_name_or_path=model_name, device=device) + logger.info(f"Model {model_name} loaded on {device}") + self.model = model + + def embedding(self, request): + embeddings = self.model.encode(request.input, batch_size=self.batch_size, show_progress_bar=False, convert_to_tensor=True) + return embeddings + + +class Transformer: + """ + The class for all Huggingface transformer-based models + """ + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + + def __init__(self, model_name, batch_size, disable_gpu): + """ + Initialize the Transformer + :param model_name: the Huggingface model name + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + """ + self._load_model(AutoModel, model_name, disable_gpu) + self.batch_size = batch_size + + def _load_model(self, model_cls, model_name, disable_gpu): + """ + Load the Transformer model model_name and its tokenizer with Huggingface. + Model will be moved to GPU unless CUDA is unavailable or disable_gpu is true. + """ + logger.debug(f"Loading model {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) + # Check if GPU is available + device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu" + model = model_cls.from_pretrained(model_name).to(device) + logger.info(f"Model {model_name} loaded on {device}") + + self.model = model + self.tokenizer = tokenizer + + def _ensure_tensor_on_device(self, **inputs): + """ + Ensure PyTorch tensors are on the specified device. + + Args: + inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`. + + Return: + :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device. + """ + return {name: tensor.to(self.model.device) for name, tensor in inputs.items()} + + def _predict(self, request, output_features=False): + """ + Inference on the input. + :param request: the request with the input and optional kwargs + :param output_features: return the features of the input. + Necessary if, e.g., attention mask is needed for post-processing. 
+ :return: The model outputs and optionally the input features + """ + all_predictions = [] + request.preprocessing_kwargs["padding"] = request.preprocessing_kwargs.get("padding", True) + request.preprocessing_kwargs["truncation"] = request.preprocessing_kwargs.get("truncation", True) + features = self.tokenizer(request.input, + return_tensors="pt", + **request.preprocessing_kwargs) + for start_idx in range(0, len(request.input), self.batch_size): + with torch.no_grad(): + input_features = {k: features[k][start_idx:start_idx+self.batch_size] for k in features.keys()} + input_features = self._ensure_tensor_on_device(**input_features) + predictions = self.model(**input_features, **request.model_kwargs) + all_predictions.append(predictions) + keys = all_predictions[0].keys() + final_prediction = {} + for key in keys: + # HuggingFace outputs for 'attentions' and more is returned as tuple of tensors + # Tuple of tuples only exists for 'past_key_values' which is only relevant for generation. + # Generation should NOT use this function + if isinstance(all_predictions[0][key], tuple): + tuple_of_lists = list(zip(*[[p.cpu() for p in tpl[key]] for tpl in all_predictions])) + final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists) + else: + final_prediction[key] = torch.cat([p[key].cpu() for p in all_predictions]) + if output_features: + return final_prediction, features + return final_prediction + + def embedding(self, request): + request.model_kwargs["output_hidden_states"] = True + predictions, features = self._predict(request, output_features=True) + # We remove hidden_states from predictions! + hidden_state = predictions.pop("hidden_states")[-1] + attention_mask = features["attention_mask"] + + embedding_mode = request.task_kwargs.get("embedding_mode", "mean") + + if embedding_mode not in self.SUPPORTED_EMBEDDING_MODES: + raise ValueError(f"Embedding mode {embedding_mode} not in list of supported modes {self.SUPPORTED_EMBEDDING_MODES}") + + if embedding_mode == "cls": + emb = hidden_state[:, 0, :] + # copied from sentence-transformers pooling + elif embedding_mode == "max": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + hidden_state[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + emb = torch.max(hidden_state, 1)[0] + # copied from sentence-transformers pooling + elif embedding_mode == "mean": + input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() + sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + emb = sum_embeddings / sum_mask + elif embedding_mode == "token": + emb = hidden_state + #word_ids = [features.word_ids(i) for i in range(len(request.input))] + return emb + + +class AdapterTransformer(Transformer): + """ + The class for all adapter-based models using the adapter-transformers package + """ + def __init__(self, model_name, batch_size, disable_gpu, transformers_cache, adapter_name): + """ + Initialize the Adapter with its underlying Transformer and pre-load all available adapters from adapterhub.ml + :param model_name: the Huggingface model name + :param batch_size: batch size used for inference + :param disable_gpu: do not move model to GPU even if CUDA is available + :param transformers_cache: Should be same as TRANSFORMERS_CACHE env variable. 
+ This folder will be used to store the adapters + """ + self._load_model(transformers.AutoModelWithHeads, model_name, disable_gpu) + self.model.load_adapter(adapter_name, load_as=adapter_name, cache_dir=transformers_cache) + self.model.to(self.model.device) + self.model.set_active_adapters(adapter_name) + self.batch_size = batch_size + + +def read_batch(file_pointer, batch_size): + i = 0 + lines = [] + finished = False + while i < batch_size: + line = file_pointer.readline().strip() + # empty string -> file finished + if not line: + finished = True + break + else: + lines.append(line) + i += 1 + return lines, finished + + +def encode(args): + transformers_cache = args.transformers_cache + os.environ["TRANSFORMERS_CACHE"] = transformers_cache + model_name = args.model_name + model_type = args.model_type + batch_size = args.batch_size + chunk_size = args.chunk_size + input_file = args.input_file + output_file = os.path.splitext(args.output_file)[0] # Remove .pkl from name + adapter_name = args.adapter_name + + if model_type == "sentence-transformer": + model = SentenceTransformer(model_name, batch_size, False) + elif model_type == "transformer": + model = Transformer(model_name, batch_size, False) + elif model_type == "adapter": + model = AdapterTransformer(model_name, batch_size, False, transformers_cache, adapter_name) + + logger.info(f"Reading input from {input_file}") + if os.path.dirname(output_file): + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(input_file, "r", encoding="utf-8") as f_in: + # We do not know how large the input file is so we read it batch-wise for memory safety reasons + finished_reading = False + total_processed_lines = 0 + chunk_idx = 0 + current_processed_lines = 0 + current_output = {} + while not finished_reading: + lines, finished_reading = read_batch(f_in, batch_size) + if lines: + lines = [json.loads(line) for line in lines] + texts = [line["text"] for line in lines] + ids = [line["id"] for line in lines] + + input = PredictionRequest(input=texts) + embeddings = model.embedding(input) + embeddings = embeddings.cpu().numpy() + + current_output.update({id: row for id, row in zip(ids, embeddings)}) + + total_processed_lines += len(lines) + current_processed_lines += len(lines) + + if current_processed_lines >= chunk_size or finished_reading: + logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) + + current_processed_lines = 0 + current_output = {} + chunk_idx += 1 + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--transformers_cache", help="Cache folder where model files will be downloaded into and loaded from") + parser.add_argument("--model_name", help="Model name, i.e., name used in transformers oder sentence-transformers to load the pre-trained model") + parser.add_argument("--model_type", help="Model type, one of 'adapter', 'transformer', 'sentence-transformer'") + parser.add_argument("--batch_size", type=int, help="Batch size used for encoding") + parser.add_argument("--chunk_size", type=int, help="Chunk size used for writing out embeddings. 
" + "ATTENTION: This value will be set to the first value satisfying true_chunk_size mod batch_size == 0" + "Each output file contains chunk_size embeddings " + "(except the last one if len($input) mod chunk_size != 0)") + parser.add_argument("--input_file", help="Input .jsonl file. Each line is a dict object: {'id': 'xxx', 'text': 'abc...'}") + parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." + "Format: Dict[str, numpy.ndarray], i.e. mapping ids to the document embeddings") + parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") + args = parser.parse_args() + + encode(args) From 16182cbbae960dde569c13d767441fed7320bc9f Mon Sep 17 00:00:00 2001 From: Geigle Date: Wed, 14 Jul 2021 16:37:21 +0200 Subject: [PATCH 15/23] Adapter and GPUs: Fixed adapters not moved to model device --- .../square_model_inference/inference/adaptertransformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 5c97532e9..99911bef2 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -45,6 +45,8 @@ def _load_adapter(self, model_name, transformers_cache): logger.debug(f"Could not load {adapter} due to missing label_ids in config resulting in exception:\n{e.args[0]}") else: raise e + # Move all freshly loaded adapter weights to the same device as the model + self.model.to(self.model.device) # def _load_single_adapter(self, adapter_name: str): # if adapter_name not in self.model.config.adapters.adapters: From f1d8e00219400fe7125723f8110d3b3b37fdcada Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 22 Jul 2021 09:47:29 +0200 Subject: [PATCH 16/23] Added support for .hdf5 and float16 storage of embeddings. Note: pickle seems to actually be slightly smaller than hdf5 --- .../offline_encoding_for_data_api.py | 77 ++++++------------- 1 file changed, 24 insertions(+), 53 deletions(-) diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py index 4f787e24e..66c48696f 100644 --- a/square-model-inference-api/offline_encoding_for_data_api.py +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -5,10 +5,10 @@ import logging import json import pickle +from dataclasses import dataclass from typing import Union, List - +import h5py import torch -from pydantic import BaseModel, Field # Conditionally load adapter or sentence-transformer later to simplify installation #from sentence_transformers import SentenceTransformer as SentenceTransformerModel import transformers @@ -18,55 +18,16 @@ logger.setLevel(logging.DEBUG) -class PredictionRequest(BaseModel): +@dataclass +class PredictionRequest: """ Prediction request containing the input, pre-processing parameters, parameters for the model forward pass, the task with task-specific parameters, and parameters for any post-processing """ - input: Union[List[str], List[List[str]], dict] = Field( - ..., - description="Input for the model. 
Supports Huggingface Transformer inputs (i.e., list of sentences, or " - "list of pairs of sentences), a dictionary with Transformer inputs, or a dictionary containing " - "numpy arrays (as lists). For the numpy arrays, also set is_preprocessed=True. " - "

" - "Transformer/ Adapter:
" - "Task 'question_answering' expects the input to be in the (question, context) format." - ) - preprocessing_kwargs: dict = Field( - default={}, - description="Optional dictionary containing additional parameters for the pre-processing step.

" - "SentenceTransformer: This is ignored.
" - "Transformer/ Adapter: See the Huggingface tokenizer for possible parameters." - ) - model_kwargs: dict = Field( - default={}, - description="Optional dictionary containing parameters that are passed to the model for the forward pass " - "to control what additional tensors are returned.

" - "SentenceTransformer: This is ignored.
" - "Transformer/ Adapter: See the forward method of the Huggingface models for possible parameters" - "For example, set ‘output_attentions=True’ to receive the attention results in the output." - ) - task_kwargs: dict = Field( - default={}, - description="Optional dictionary containing additional parameters for handling of the task and " - "task-related post-processing.

" - "SentenceTransformer: This is ignored.
" - "Transformer/ Adapter:
" - "'sentence_classification':
" - "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" - "'token_classification':
" - "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" - "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" - "'question_answering':
" - "- 'topk': Return the top-k most likely spans. Default 1.
" - "- 'max_answer_len': Maximal token length of answers. Default 128.
" - "'generation':
" - "- 'clean_up_tokenization_spaces': See parameter in Huggingface tokenizer.decode(). Default False
" - "- See Huggingface model.generate() for all possible parameters that can be used. " - "Note, 'model_kwargs' and 'task_kwargs' are merged for generation." - - ) + input: Union[List[str], List[List[str]], dict] + preprocessing_kwargs: dict + model_kwargs: dict + task_kwargs: dict class SentenceTransformer: @@ -282,9 +243,11 @@ def encode(args): texts = [line["text"] for line in lines] ids = [line["id"] for line in lines] - input = PredictionRequest(input=texts) + input = PredictionRequest(input=texts, preprocessing_kwargs={}, model_kwargs={}, task_kwargs={"embedding_mode": "mean"}) embeddings = model.embedding(input) embeddings = embeddings.cpu().numpy() + if args.float16: + embeddings = embeddings.astype("float16") current_output.update({id: row for id, row in zip(ids, embeddings)}) @@ -294,11 +257,17 @@ def encode(args): if current_processed_lines >= chunk_size or finished_reading: logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") - chunk_output_file = f"{output_file}_{chunk_idx}.pkl" - with open(chunk_output_file, "wb") as out_f: - logger.info(f"Writing chunk in {chunk_output_file}") - pickle.dump(current_output, out_f) - + if args.hdf5: + chunk_output_file = f"{output_file}_{chunk_idx}.h5" + with h5py.File(chunk_output_file, "w") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + for k, v in current_output.items(): + out_f.create_dataset(k, data=v) + else: + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) current_processed_lines = 0 current_output = {} chunk_idx += 1 @@ -317,6 +286,8 @@ def encode(args): parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." "Format: Dict[str, numpy.ndarray], i.e. 
mapping ids to the document embeddings") parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") + parser.add_argument("--hdf5", action="store_true") + parser.add_argument("--float16", action="store_true") args = parser.parse_args() encode(args) From fe0e868d9351488db410510c5c90492992a71706 Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 30 Jul 2021 14:40:25 +0200 Subject: [PATCH 17/23] Offline Encoding: changed output format so hdf5 can better compress it; added hdf5 gzip compression option --- .../offline_encoding_for_data_api.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py index 66c48696f..2e1036145 100644 --- a/square-model-inference-api/offline_encoding_for_data_api.py +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -9,6 +9,7 @@ from typing import Union, List import h5py import torch +import numpy as np # Conditionally load adapter or sentence-transformer later to simplify installation #from sentence_transformers import SentenceTransformer as SentenceTransformerModel import transformers @@ -235,7 +236,7 @@ def encode(args): total_processed_lines = 0 chunk_idx = 0 current_processed_lines = 0 - current_output = {} + current_output = {"ids": [], "embeddings": []} while not finished_reading: lines, finished_reading = read_batch(f_in, batch_size) if lines: @@ -249,7 +250,8 @@ def encode(args): if args.float16: embeddings = embeddings.astype("float16") - current_output.update({id: row for id, row in zip(ids, embeddings)}) + current_output["ids"].extend(ids) + current_output["embeddings"].append(embeddings) total_processed_lines += len(lines) current_processed_lines += len(lines) @@ -257,19 +259,25 @@ def encode(args): if current_processed_lines >= chunk_size or finished_reading: logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + current_output["embeddings"] = np.concatenate(current_output["embeddings"]) + if args.hdf5: chunk_output_file = f"{output_file}_{chunk_idx}.h5" with h5py.File(chunk_output_file, "w") as out_f: logger.info(f"Writing chunk in {chunk_output_file}") - for k, v in current_output.items(): - out_f.create_dataset(k, data=v) + if args.hdf5_gzip_level < 0: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S")) + out_f.create_dataset("embeddings", data=current_output["embeddings"]) + else: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S"), compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + out_f.create_dataset("embeddings", data=current_output["embeddings"], compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) else: chunk_output_file = f"{output_file}_{chunk_idx}.pkl" with open(chunk_output_file, "wb") as out_f: logger.info(f"Writing chunk in {chunk_output_file}") pickle.dump(current_output, out_f) current_processed_lines = 0 - current_output = {} + current_output = {"ids": [], "embeddings": []} chunk_idx += 1 if __name__ == "__main__": @@ -279,14 +287,17 @@ def encode(args): parser.add_argument("--model_type", help="Model type, one of 'adapter', 'transformer', 'sentence-transformer'") parser.add_argument("--batch_size", type=int, help="Batch size used for encoding") parser.add_argument("--chunk_size", type=int, help="Chunk size used for writing out embeddings. 
" - "ATTENTION: This value will be set to the first value satisfying true_chunk_size mod batch_size == 0" + "ATTENTION: This value will be set to the first value satisfying: true_chunk_size mod batch_size == 0" "Each output file contains chunk_size embeddings " - "(except the last one if len($input) mod chunk_size != 0)") + "(except the last one if len(input) mod chunk_size != 0)") parser.add_argument("--input_file", help="Input .jsonl file. Each line is a dict object: {'id': 'xxx', 'text': 'abc...'}") parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." - "Format: Dict[str, numpy.ndarray], i.e. mapping ids to the document embeddings") + "Format: {'ids': List[str], 'embeddings': ndarray}. " + "Note for hdf5, use f['ids'].asstr() to load ids as string because default is binary.") parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") parser.add_argument("--hdf5", action="store_true") + parser.add_argument("--hdf5-gzip-level", type=int, default=4, help="GZIP compression level for HDF5 in range 0-9, default 4 (bigger is more compressed). " + "Set to negative value to disable compression") parser.add_argument("--float16", action="store_true") args = parser.parse_args() From fe59bcf41410b099fe123a905544e5704d5e066a Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 30 Jul 2021 16:15:47 +0200 Subject: [PATCH 18/23] Included in CICD Pipeline; changed logging formatting for ELK stack --- .github/workflows/ci.yml | 183 ++++++++++++++++++ square-model-inference-api/README.md | 3 + .../auth_server/Dockerfile | 7 +- .../auth_server/ElkJsonFormatter.tar.gz | Bin 0 -> 1407 bytes .../auth_server/auth_api/security.py | 5 +- .../auth_server/logging.conf | 21 ++ .../auth_server/main.py | 3 + .../auth_server/requirements.txt | 1 + .../example_docker-compose.yml | 11 +- .../inference_server/Dockerfile | 15 +- .../inference_server/ElkJsonFormatter.tar.gz | Bin 0 -> 1407 bytes .../inference_server/logging.conf | 21 ++ .../inference_server/main.py | 6 +- .../inference_server/requirements1.txt | 3 +- square-model-inference-api/nginx/nginx.conf | 17 ++ 15 files changed, 283 insertions(+), 13 deletions(-) create mode 100644 square-model-inference-api/auth_server/ElkJsonFormatter.tar.gz create mode 100644 square-model-inference-api/auth_server/logging.conf create mode 100644 square-model-inference-api/inference_server/ElkJsonFormatter.tar.gz create mode 100644 square-model-inference-api/inference_server/logging.conf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5c484fd29..6f860fb37 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,3 +151,186 @@ jobs: run: | rm -rf /tmp/.buildx-cache mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + model-api: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Prepare + id: prep + run: | + TAG=$(echo $GITHUB_SHA | head -c7) + IMAGE="ukpsquare/square-model-api" + echo ::set-output name=image::${IMAGE} + echo ::set-output name=tag::${TAG} + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + install: true + + - name: Cache Docker layers + uses: actions/cache@v2 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-model_api-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx-model_api- + ${{ runner.os }}-buildx- + + - name: Build test image + uses: docker/build-push-action@v2 + with: + 
builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/inference_server + target: test + load: true + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + - name: Retrieve Test Reports + id: extract + uses: shrink/actions-docker-extract@v1 + with: + image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + path: /app/test-reports + + - uses: actions/upload-artifact@v2 + with: + name: model_api-test-reports + path: ${{ steps.extract.outputs.destination }}/test-reports + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v2 + with: + report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml + check_name: Model API Test Report + fail_on_failure: true + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + + - name: Build deployable image + uses: docker/build-push-action@v2 + with: + builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/inference_server + target: build + push: ${{github.ref == 'refs/heads/master'}} + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}, ${{ steps.prep.outputs.image }}:latest + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + model-api-auth: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Prepare + id: prep + run: | + TAG=$(echo $GITHUB_SHA | head -c7) + IMAGE="ukpsquare/square-model-api-auth" + echo ::set-output name=image::${IMAGE} + echo ::set-output name=tag::${TAG} + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + install: true + + - name: Cache Docker layers + uses: actions/cache@v2 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-model_api_auth-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx-model_api_auth- + ${{ runner.os }}-buildx- + +# No tests for auth server currently. 
+# - name: Build test image +# uses: docker/build-push-action@v2 +# with: +# builder: ${{ steps.buildx.outputs.name }} +# context: ./square-model-inference-api/auth_server +# target: test +# load: true +# tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test +# cache-from: type=local,src=/tmp/.buildx-cache +# cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new +# +# # Temp fix +# # https://github.com/docker/build-push-action/issues/252 +# # https://github.com/moby/buildkit/issues/1896 +# - name: Move cache +# run: | +# rm -rf /tmp/.buildx-cache +# mv /tmp/.buildx-cache-new /tmp/.buildx-cache +# +# - name: Retrieve Test Reports +# id: extract +# uses: shrink/actions-docker-extract@v1 +# with: +# image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test +# path: /app/test-reports +# +# - uses: actions/upload-artifact@v2 +# with: +# name: model_api_auth-test-reports +# path: ${{ steps.extract.outputs.destination }}/test-reports +# +# - name: Publish Test Report +# uses: mikepenz/action-junit-report@v2 +# with: +# report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml +# check_name: Model API Auth Test Report +# fail_on_failure: true + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + + - name: Build deployable image + uses: docker/build-push-action@v2 + with: + builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/auth_server + #target: build # Not multistage + push: ${{github.ref == 'refs/heads/master'}} + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}, ${{ steps.prep.outputs.image }}:latest + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache \ No newline at end of file diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index c51c79ab3..26dcb90a1 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -30,6 +30,9 @@ to handle authorization of requests with the auth server. └───example_docker-compose.yml # Example docker-compose setup for the Model API ``` +### Logging +The components use the json-formatted logging used by the ELK Stack in square-core/logging. + ## Requirements Python 3.7+, Docker (optional), Make (optional) diff --git a/square-model-inference-api/auth_server/Dockerfile b/square-model-inference-api/auth_server/Dockerfile index aadc437d3..e630e0821 100644 --- a/square-model-inference-api/auth_server/Dockerfile +++ b/square-model-inference-api/auth_server/Dockerfile @@ -5,10 +5,13 @@ ENV PYTHONUNBUFFERED 1 EXPOSE 8081 WORKDIR /app +COPY ElkJsonFormatter.tar.gz ./ElkJsonFormatter.tar.gz RUN pip install --upgrade pip COPY requirements.txt ./ RUN pip install -r requirements.txt -COPY . 
./ +COPY main.py main.py +COPY ./auth_api auth_api +COPY logging.conf logging.conf -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081"] \ No newline at end of file +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081", "--log-config", "logging.conf"] \ No newline at end of file diff --git a/square-model-inference-api/auth_server/ElkJsonFormatter.tar.gz b/square-model-inference-api/auth_server/ElkJsonFormatter.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..19a8ecae00106938f782c0e9437af63071c4adce GIT binary patch literal 1407 zcmV-_1%Ub=iwFp$Kh$6X|72-%bT370Yf5u(Zbol%ZDDkDWpXVrE-)@KE_7jX0PUPx zQ{qS%hP}?O=<)`rf|!IH)GeyU+HrOcJLr1Yxv;F*CY{i0PMdBV`S;tMfIvV|bTaPn zy*H3fcLzwm_sQ33THR51-W}t+j|BT9-})iaN ztn$?^Up2#=kNLUyH*L$=LROveFM{CFdZJno{Ce@Pz6t*~7yaJ=|F6aWcbE9TVegvS zE)ZaI{l8kg^U~0Gmj4^V0W|Rcjq!iOUiAMi_ifL``HA%lWTPkH~ zP#m(kymdrwNmMi}OVpwu!7*80z4)1C>pxf(XIVr&%xRhw^&6*uogbVWHrDWM;6Lym z_z(PF#edhO3FWT)9PU4r|C;4g_%GB2{@2ETwR*ZIfUlN=G5l(J8{@y-MgHr+|62I( zEF1sb+jAUROr-6vt#p-GVLtQi{58~@d9&4BUWjq!iT|K3dgC;cDtKk)wt{r~vO zADvJA4@a+<{~1no{MT?a6Y{@0^p$YzV~#t2lbq2s*+;sF|2~e%{$lobU#GZ8l+$4x z<9xJ_&ih~bN8kF&$22CL42NV|bTH(DG_P)VNQ`MTE&Y>vBw;hn-V^5KG~>n5CWXfl z=5oY!S_1J4{0II6|3S6!|Mckm%&;>?@4$ zVWwsyMiQSS-pGw;a>G_s&&j_-nv+%4c$&GzQ`+ZT z-oSt0Kky&48T?;nyf5Uxrs);_TekfF7x4eb3ql?8f8amxANUXae-i&^GpSd~{}=xM zZJ4@a0srezIrY0E=V^D!8OXh!Aua(OJOKC){0IJnfd4NXB3;RUqniJlT?6?4Q_eqx zKJdSG{=@noKac;**8k{gyIluXfZAaGuQAJSef}@Z|I;nU)`9grO@R>|81whi=LC9x65pzV(o%fD=Ie5BWdj|Des{{~Bw(t;2u$u)o6pt^?!$ zwWwJCtvK5}O(Qm0_^l{uqkty9J02ZiO8FPTX0arxT${AFUA=m?+r`dt*4x`IE~ate zq?_Jcr%jr8(a bool: if header is None: + logger.info("Attempted access without API Key") raise HTTPException( status_code=HTTP_400_BAD_REQUEST, detail=NO_API_KEY.format(API_KEY_HEADER_NAME), headers={} ) elif not secrets.compare_digest(header, str(API_KEY)): + logger.info(f"Attempted access with wrong API Key {header}") raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, detail=AUTH_REQ, headers={} ) + logger.info("Successful authorization with API Key") return True diff --git a/square-model-inference-api/auth_server/logging.conf b/square-model-inference-api/auth_server/logging.conf new file mode 100644 index 000000000..d62db8187 --- /dev/null +++ b/square-model-inference-api/auth_server/logging.conf @@ -0,0 +1,21 @@ +[loggers] +keys = root + +[logger_root] +level = DEBUG +handlers = root + +[handlers] +keys = root + +[handler_root] +class = StreamHandler +level = DEBUG +formatter = json + +[formatters] +keys = json + +[formatter_json] +class = ElkJsonFormatter.ElkJsonFormatter + diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index 93894341f..c137f9d0a 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -1,5 +1,8 @@ from fastapi import FastAPI, Depends import auth_api.security as security +from logging.config import fileConfig + +fileConfig("logging.conf") app = FastAPI() diff --git a/square-model-inference-api/auth_server/requirements.txt b/square-model-inference-api/auth_server/requirements.txt index 1141b0ad8..a7128b727 100644 --- a/square-model-inference-api/auth_server/requirements.txt +++ b/square-model-inference-api/auth_server/requirements.txt @@ -1,3 +1,4 @@ uvicorn==0.13.4 # ASGI server fastapi==0.65.1 # REST API Framework pydantic==1.8.2 # Input/ output modelling +ElkJsonFormatter.tar.gz # Logging Formatter \ No newline at end of file diff --git 
a/square-model-inference-api/example_docker-compose.yml b/square-model-inference-api/example_docker-compose.yml index d15b00b0a..e03421974 100644 --- a/square-model-inference-api/example_docker-compose.yml +++ b/square-model-inference-api/example_docker-compose.yml @@ -2,7 +2,8 @@ version: "3" services: auth: - build: ./auth_server/. + #build: ./auth_server/. + image: ukpsquare/square-model-api-auth:latest ports: - 8081:8081 env_file: @@ -17,8 +18,8 @@ services: # Example config for one model server inference_bert: - build: ./inference_server/. - image: square_model_inference:latest + #build: ./inference_server/. + image: ukpsquare/square-model-api:latest ports: - 8000:8000 env_file: @@ -30,8 +31,8 @@ services: # Example config for another model server inference_roberta: - build: ./inference_server/. - image: square_model_inference:latest + #build: ./inference_server/. + image: ukpsquare/square-model-api:latest ports: - 8001:8000 env_file: diff --git a/square-model-inference-api/inference_server/Dockerfile b/square-model-inference-api/inference_server/Dockerfile index fefc84ae8..041a92c18 100644 --- a/square-model-inference-api/inference_server/Dockerfile +++ b/square-model-inference-api/inference_server/Dockerfile @@ -5,6 +5,7 @@ ENV PYTHONUNBUFFERED 1 EXPOSE 8000 WORKDIR /app +COPY ElkJsonFormatter.tar.gz ./ElkJsonFormatter.tar.gz RUN pip install --upgrade pip COPY requirements1.txt requirements2.txt uninstall_requirements.txt ./ RUN pip install -r requirements1.txt @@ -19,11 +20,19 @@ COPY ./tests/pre_test_setup_for_docker_caching.py ./tests/pre_test_setup_for_doc RUN python ./tests/pre_test_setup_for_docker_caching.py COPY . ./ RUN pip install pytest pytest-cov pytest-asyncio -RUN PYTHONPATH=./ pytest --cov +RUN mkdir test-reports +RUN PYTHONPATH=./ pytest \ + --junitxml=test-reports/junit.xml \ + --cov \ + --cov-report=xml:test-reports/coverage.xml \ + --cov-report=html:test-reports/coverage.html; \ + echo $? > test-reports/pytest.existcode # Deployment stage FROM base as build -COPY . 
./ +COPY main.py main.py +COPY ./square_model_inference square_model_inference +COPY logging.conf logging.conf -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--log-config", "logging.conf"] \ No newline at end of file diff --git a/square-model-inference-api/inference_server/ElkJsonFormatter.tar.gz b/square-model-inference-api/inference_server/ElkJsonFormatter.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..19a8ecae00106938f782c0e9437af63071c4adce GIT binary patch literal 1407 zcmV-_1%Ub=iwFp$Kh$6X|72-%bT370Yf5u(Zbol%ZDDkDWpXVrE-)@KE_7jX0PUPx zQ{qS%hP}?O=<)`rf|!IH)GeyU+HrOcJLr1Yxv;F*CY{i0PMdBV`S;tMfIvV|bTaPn zy*H3fcLzwm_sQ33THR51-W}t+j|BT9-})iaN ztn$?^Up2#=kNLUyH*L$=LROveFM{CFdZJno{Ce@Pz6t*~7yaJ=|F6aWcbE9TVegvS zE)ZaI{l8kg^U~0Gmj4^V0W|Rcjq!iOUiAMi_ifL``HA%lWTPkH~ zP#m(kymdrwNmMi}OVpwu!7*80z4)1C>pxf(XIVr&%xRhw^&6*uogbVWHrDWM;6Lym z_z(PF#edhO3FWT)9PU4r|C;4g_%GB2{@2ETwR*ZIfUlN=G5l(J8{@y-MgHr+|62I( zEF1sb+jAUROr-6vt#p-GVLtQi{58~@d9&4BUWjq!iT|K3dgC;cDtKk)wt{r~vO zADvJA4@a+<{~1no{MT?a6Y{@0^p$YzV~#t2lbq2s*+;sF|2~e%{$lobU#GZ8l+$4x z<9xJ_&ih~bN8kF&$22CL42NV|bTH(DG_P)VNQ`MTE&Y>vBw;hn-V^5KG~>n5CWXfl z=5oY!S_1J4{0II6|3S6!|Mckm%&;>?@4$ zVWwsyMiQSS-pGw;a>G_s&&j_-nv+%4c$&GzQ`+ZT z-oSt0Kky&48T?;nyf5Uxrs);_TekfF7x4eb3ql?8f8amxANUXae-i&^GpSd~{}=xM zZJ4@a0srezIrY0E=V^D!8OXh!Aua(OJOKC){0IJnfd4NXB3;RUqniJlT?6?4Q_eqx zKJdSG{=@noKac;**8k{gyIluXfZAaGuQAJSef}@Z|I;nU)`9grO@R>|81whi=LC9x65pzV(o%fD=Ie5BWdj|Des{{~Bw(t;2u$u)o6pt^?!$ zwWwJCtvK5}O(Qm0_^l{uqkty9J02ZiO8FPTX0arxT${AFUA=m?+r`dt*4x`IE~ate zq?_Jcr%jr8(a FastAPI: + # Set logging config. Loguru uses this then, too + fileConfig("logging.conf") fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) fast_app.include_router(api_router, prefix=API_PREFIX) diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt index 00e9c0a57..794d45f82 100644 --- a/square-model-inference-api/inference_server/requirements1.txt +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -5,4 +5,5 @@ loguru==0.5.3 # Easier logging python-dotenv==0.17.1 # Required for .env configs sentencepiece==0.1.95 torch==1.8.1 -sentence-transformers==1.2.0 \ No newline at end of file +sentence-transformers==1.2.0 +ElkJsonFormatter.tar.gz # Logging Formatter \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index c6051be8f..1858be5bf 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -3,7 +3,24 @@ events { } http { + log_format json_combined escape=json + '{ "@timestamp": "$time_iso8601", ' + '"remote_addr": "$remote_addr", ' + '"http_referer": "$http_referer", ' + '"request": "$request", ' + '"status": $status, ' + '"body_bytes_sent": $body_bytes_sent, ' + '"http_user_agent": "$http_user_agent", ' + '"http_x_forwarded_for": "$http_x_forwarded_for", ' + '"upstream_addr": "$upstream_addr",' + '"upstream_http_host": "$upstream_http_host",' + '"upstream_response_time": "$upstream_response_time",' + '"request_time": "$request_time"' + '}'; + server { + access_log /var/log/nginx/access.log json_combined; + listen 8080; location / { From 547791511c63d7761c44cef3b2a95718533208d6 Mon Sep 17 00:00:00 2001 From: Geigle Date: Thu, 5 Aug 2021 14:10:59 +0200 Subject: [PATCH 19/23] Updated adapter-transformers version to fix error; added 
basic tests for auth server and included that in ci --- .github/workflows/ci.yml | 75 +++++++++---------- .../auth_server/Dockerfile | 15 +++- .../auth_server/main.py | 7 +- .../auth_server/tests/conftest.py | 24 ++++++ .../auth_server/tests/test_api/__init__.py | 0 .../auth_server/tests/test_api/test_auth.py | 34 +++++++++ .../inference_server/main.py | 6 +- .../inference_server/requirements2.txt | 2 +- 8 files changed, 120 insertions(+), 43 deletions(-) create mode 100644 square-model-inference-api/auth_server/tests/conftest.py create mode 100644 square-model-inference-api/auth_server/tests/test_api/__init__.py create mode 100644 square-model-inference-api/auth_server/tests/test_api/test_auth.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6f860fb37..7bb1ad9c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -271,44 +271,43 @@ jobs: ${{ runner.os }}-buildx-model_api_auth- ${{ runner.os }}-buildx- -# No tests for auth server currently. -# - name: Build test image -# uses: docker/build-push-action@v2 -# with: -# builder: ${{ steps.buildx.outputs.name }} -# context: ./square-model-inference-api/auth_server -# target: test -# load: true -# tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test -# cache-from: type=local,src=/tmp/.buildx-cache -# cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new -# -# # Temp fix -# # https://github.com/docker/build-push-action/issues/252 -# # https://github.com/moby/buildkit/issues/1896 -# - name: Move cache -# run: | -# rm -rf /tmp/.buildx-cache -# mv /tmp/.buildx-cache-new /tmp/.buildx-cache -# -# - name: Retrieve Test Reports -# id: extract -# uses: shrink/actions-docker-extract@v1 -# with: -# image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test -# path: /app/test-reports -# -# - uses: actions/upload-artifact@v2 -# with: -# name: model_api_auth-test-reports -# path: ${{ steps.extract.outputs.destination }}/test-reports -# -# - name: Publish Test Report -# uses: mikepenz/action-junit-report@v2 -# with: -# report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml -# check_name: Model API Auth Test Report -# fail_on_failure: true + - name: Build test image + uses: docker/build-push-action@v2 + with: + builder: ${{ steps.buildx.outputs.name }} + context: ./square-model-inference-api/auth_server + target: test + load: true + tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,mode=max,dest=/tmp/.buildx-cache-new + + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + + - name: Retrieve Test Reports + id: extract + uses: shrink/actions-docker-extract@v1 + with: + image: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}-test + path: /app/test-reports + + - uses: actions/upload-artifact@v2 + with: + name: model_api_auth-test-reports + path: ${{ steps.extract.outputs.destination }}/test-reports + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v2 + with: + report_paths: ${{ steps.extract.outputs.destination }}/test-reports/junit.xml + check_name: Model API Auth Test Report + fail_on_failure: true - name: Login to Docker Hub uses: docker/login-action@v1 diff --git a/square-model-inference-api/auth_server/Dockerfile 
b/square-model-inference-api/auth_server/Dockerfile index e630e0821..394ce24d8 100644 --- a/square-model-inference-api/auth_server/Dockerfile +++ b/square-model-inference-api/auth_server/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7.6-slim-buster +FROM python:3.7.6-slim-buster as base ENV PYTHONUNBUFFERED 1 @@ -14,4 +14,17 @@ COPY main.py main.py COPY ./auth_api auth_api COPY logging.conf logging.conf +FROM base as test +RUN pip install pytest pytest-cov pytest-asyncio +RUN mkdir test-reports +RUN PYTHONPATH=./ pytest \ + --junitxml=test-reports/junit.xml \ + --cov \ + --cov-report=xml:test-reports/coverage.xml \ + --cov-report=html:test-reports/coverage.html; \ + echo $? > test-reports/pytest.existcode + +# Deployment stage +FROM base as build + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8081", "--log-config", "logging.conf"] \ No newline at end of file diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index c137f9d0a..fb323930d 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -1,9 +1,12 @@ from fastapi import FastAPI, Depends import auth_api.security as security from logging.config import fileConfig +from loguru import logger -fileConfig("logging.conf") - +try: + fileConfig("logging.conf") +except: + logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger") app = FastAPI() diff --git a/square-model-inference-api/auth_server/tests/conftest.py b/square-model-inference-api/auth_server/tests/conftest.py new file mode 100644 index 000000000..b06b27e94 --- /dev/null +++ b/square-model-inference-api/auth_server/tests/conftest.py @@ -0,0 +1,24 @@ +import pytest + +from starlette.config import environ +API_KEY = "test_key" +API_KEY_HEADER_NAME = "Authorization" +environ["API_KEY"] = API_KEY +environ["API_KEY_HEADER_NAME"] = API_KEY_HEADER_NAME + +from main import app + + +@pytest.fixture() +def test_app(): + return app + + +@pytest.fixture() +def test_key(): + return API_KEY + + +@pytest.fixture() +def test_header(): + return API_KEY_HEADER_NAME \ No newline at end of file diff --git a/square-model-inference-api/auth_server/tests/test_api/__init__.py b/square-model-inference-api/auth_server/tests/test_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/square-model-inference-api/auth_server/tests/test_api/test_auth.py b/square-model-inference-api/auth_server/tests/test_api/test_auth.py new file mode 100644 index 000000000..10c33ed56 --- /dev/null +++ b/square-model-inference-api/auth_server/tests/test_api/test_auth.py @@ -0,0 +1,34 @@ +from starlette.testclient import TestClient + +def test_api_correct_auth(test_app, test_key, test_header) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth", + headers={test_header: test_key} + ) + assert response.status_code == 200 + assert response.json()["authenticated"] == True + + +def test_api_no_header(test_app) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth" + ) + assert response.status_code == 400 + +def test_api_wrong_header(test_app, test_key, test_header) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth", + headers={test_header+"wrong": test_key} + ) + assert response.status_code == 400 + +def test_api_wrong_key(test_app, test_key, test_header) -> None: + test_client = TestClient(test_app) + response = test_client.get( + "/auth", + 
headers={test_header: test_key+"wrong"} + ) + assert response.status_code == 401 \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index 167adb04c..a3335add8 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -5,10 +5,14 @@ from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler from logging.config import fileConfig +from loguru import logger def get_app() -> FastAPI: # Set logging config. Loguru uses this then, too - fileConfig("logging.conf") + try: + fileConfig("logging.conf") + except: + logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger") fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) fast_app.include_router(api_router, prefix=API_PREFIX) diff --git a/square-model-inference-api/inference_server/requirements2.txt b/square-model-inference-api/inference_server/requirements2.txt index 05ea5692f..02424bec4 100644 --- a/square-model-inference-api/inference_server/requirements2.txt +++ b/square-model-inference-api/inference_server/requirements2.txt @@ -1 +1 @@ -adapter-transformers==2.1.1 \ No newline at end of file +adapter-transformers==2.1.2 \ No newline at end of file From c8203556a7f6aaba34fd67a20b90c015740fa80e Mon Sep 17 00:00:00 2001 From: Geigle Date: Fri, 6 Aug 2021 15:33:16 +0200 Subject: [PATCH 20/23] Added multiprocessing with multiple GPUs for offline encoding (code only tested with 1 GPU) [skip ci] --- .../offline_encoding_for_data_api.py | 167 +++++++++++++++++- 1 file changed, 158 insertions(+), 9 deletions(-) diff --git a/square-model-inference-api/offline_encoding_for_data_api.py b/square-model-inference-api/offline_encoding_for_data_api.py index 2e1036145..173982e6f 100644 --- a/square-model-inference-api/offline_encoding_for_data_api.py +++ b/square-model-inference-api/offline_encoding_for_data_api.py @@ -5,6 +5,7 @@ import logging import json import pickle +import time from dataclasses import dataclass from typing import Union, List import h5py @@ -13,11 +14,15 @@ # Conditionally load adapter or sentence-transformer later to simplify installation #from sentence_transformers import SentenceTransformer as SentenceTransformerModel import transformers +import torch.multiprocessing as mp +import queue from transformers import AutoModel, AutoTokenizer #, AutoModelWithHeads logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(processName)s-%(levelname)s-%(asctime)s: %(message)s')) +logger.addHandler(handler) +logger.setLevel(logging.INFO) @dataclass class PredictionRequest: @@ -42,11 +47,13 @@ def __init__(self, model_name, batch_size, disable_gpu): :param model_name: the sentence-transformer model name (https://sbert.net/docs/pretrained_models.html) :param batch_size: batch size used for inference :param disable_gpu: do not move model to GPU even if CUDA is available - :param max_input_size: requests with a larger input are rejected """ self._load_model(model_name, disable_gpu) self.batch_size = batch_size + def to(self, device): + self.model.to(device) + def _load_model(self, model_name, disable_gpu): """ Load the Transformer model model_name and its tokenizer with Huggingface. 
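The multiprocessing added by this patch is a reader/encoder/writer pipeline: `_read_process` batches the input .jsonl into an input queue, one `_encode_process` per GPU consumes batches and pushes embeddings onto an output queue, and `_write_process` drains that queue and writes chunk files, stopping once nothing arrives before a timeout. A minimal sketch of that producer-consumer layout is shown here; the function and variable names are illustrative, not the script's own, and no actual GPU work is performed:

```python
import queue

import torch.multiprocessing as mp


def reader(input_queue, n_batches):
    # Producer: the real script reads the input .jsonl file batch-wise instead.
    for i in range(n_batches):
        input_queue.put([f"text {i}-{j}" for j in range(4)], block=True)


def encoder(device, input_queue, output_queue):
    # One consumer per GPU: pull a batch, encode it, push the result.
    while True:
        try:
            batch = input_queue.get(timeout=10)
        except queue.Empty:
            break
        # Placeholder for model.to(device) / model.embedding(batch) in the real script.
        output_queue.put((device, [len(text) for text in batch]))


def writer(output_queue):
    # Single writer: drain results until nothing new arrives within the timeout.
    while True:
        try:
            device, embeddings = output_queue.get(timeout=10)
        except queue.Empty:
            break
        print(device, embeddings)


if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # spawn start method is required when CUDA is involved
    input_queue, output_queue = ctx.Queue(maxsize=4), ctx.Queue()
    workers = [ctx.Process(target=reader, args=(input_queue, 8), daemon=True)]
    workers += [ctx.Process(target=encoder, args=(dev, input_queue, output_queue), daemon=True)
                for dev in ["cuda:0"]]
    workers += [ctx.Process(target=writer, args=(output_queue,), daemon=True)]
    for w in workers:
        w.start()
    workers[-1].join()  # join only the writer, as the patch does with write_p.join()
```

Because all worker processes are daemonic, joining only the writer is enough: once it stops receiving output and exits, the remaining processes are terminated together with the main process.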
@@ -80,6 +87,9 @@ def __init__(self, model_name, batch_size, disable_gpu): self._load_model(AutoModel, model_name, disable_gpu) self.batch_size = batch_size + def to(self, device): + self.model.to(device) + def _load_model(self, model_cls, model_name, disable_gpu): """ Load the Transformer model model_name and its tokenizer with Huggingface. @@ -169,7 +179,6 @@ def embedding(self, request): emb = sum_embeddings / sum_mask elif embedding_mode == "token": emb = hidden_state - #word_ids = [features.word_ids(i) for i in range(len(request.input))] return emb @@ -280,6 +289,135 @@ def encode(args): current_output = {"ids": [], "embeddings": []} chunk_idx += 1 + +def _read_process(input_file, batch_size, input_queue): + logger.info(f"Reading input from {input_file}") + with open(input_file, "r", encoding="utf-8") as f_in: + # We do not know how large the input file is so we read it batch-wise for memory safety reasons + finished_reading = False + while not finished_reading: + lines, finished_reading = read_batch(f_in, batch_size) + if lines: + lines = [json.loads(line) for line in lines] + texts = [line["text"] for line in lines] + ids = [line["id"] for line in lines] + + input = PredictionRequest(input=texts, preprocessing_kwargs={}, model_kwargs={}, task_kwargs={"embedding_mode": "mean"}) + input_queue.put((input, ids), block=True) + + +def _encode_process(model, device, float16, input_queue, output_queue): + logger.info(f"Moving model to {device}") + model.to(device) + while True: + try: + input, ids = input_queue.get() + embeddings = model.embedding(input) + embeddings = embeddings.cpu().numpy() + if float16: + embeddings = embeddings.astype("float16") + output_queue.put((embeddings, ids)) + except queue.Empty: + break + + +def _write_process(output_file, chunk_size, args, output_queue): + chunk_idx = 0 + current_processed_lines = 0 + total_processed_lines = 0 + current_output = {"ids": [], "embeddings": []} + while True: + try: + # We do not know when we are done writing. Instead we wait 60s and if we get nothing new, we assume we are done. 
+ # i.e., after the timeout, queue.Empty exception is triggered and we write the remaining output and finish + embeddings, ids = output_queue.get(timeout=60) + current_output["ids"].extend(ids) + current_output["embeddings"].append(embeddings) + + total_processed_lines += len(ids) + current_processed_lines += len(ids) + + if current_processed_lines >= chunk_size: + logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + + current_output["embeddings"] = np.concatenate(current_output["embeddings"]) + + if args.hdf5: + chunk_output_file = f"{output_file}_{chunk_idx}.h5" + with h5py.File(chunk_output_file, "w") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + if args.hdf5_gzip_level < 0: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S")) + out_f.create_dataset("embeddings", data=current_output["embeddings"]) + else: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S"), compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + out_f.create_dataset("embeddings", data=current_output["embeddings"], compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + else: + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) + current_processed_lines = 0 + current_output = {"ids": [], "embeddings": []} + chunk_idx += 1 + except queue.Empty: + logger.info(f"Processed {total_processed_lines} lines ({chunk_idx+1} chunks)") + current_output["embeddings"] = np.concatenate(current_output["embeddings"]) + if args.hdf5: + chunk_output_file = f"{output_file}_{chunk_idx}.h5" + with h5py.File(chunk_output_file, "w") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + if args.hdf5_gzip_level < 0: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S")) + out_f.create_dataset("embeddings", data=current_output["embeddings"]) + else: + out_f.create_dataset("ids", data=np.array(current_output["ids"], dtype="S"), compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + out_f.create_dataset("embeddings", data=current_output["embeddings"], compression="gzip", compression_opts=min(args.hdf5_gzip_level, 9)) + else: + chunk_output_file = f"{output_file}_{chunk_idx}.pkl" + with open(chunk_output_file, "wb") as out_f: + logger.info(f"Writing chunk in {chunk_output_file}") + pickle.dump(current_output, out_f) + break + + +def encode_multiprocess(args): + transformers_cache = args.transformers_cache + os.environ["TRANSFORMERS_CACHE"] = transformers_cache + model_name = args.model_name + model_type = args.model_type + batch_size = args.batch_size + chunk_size = args.chunk_size + input_file = args.input_file + output_file = os.path.splitext(args.output_file)[0] # Remove file extension from name + adapter_name = args.adapter_name + devices = args.gpus.split(",") + + ctx = mp.get_context('spawn') + + if model_type == "sentence-transformer": + model = SentenceTransformer(model_name, batch_size, True) + elif model_type == "transformer": + model = Transformer(model_name, batch_size, True) + elif model_type == "adapter": + model = AdapterTransformer(model_name, batch_size, True, transformers_cache, adapter_name) + + if os.path.dirname(output_file): + os.makedirs(os.path.dirname(output_file), exist_ok=True) + + input_queue = ctx.Queue(maxsize=2*len(devices)) + output_queue = ctx.Queue() + + read_p = ctx.Process(target=_read_process, 
args=(input_file, batch_size, input_queue), daemon=True) + read_p.start() + write_p = ctx.Process(target=_write_process, args=(output_file, chunk_size, args, output_queue), daemon=True) + write_p.start() + for cuda_id in devices: + p = ctx.Process(target=_encode_process, args=(model, cuda_id, args.float16, input_queue, output_queue), daemon=True) + p.start() + + write_p.join() + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--transformers_cache", help="Cache folder where model files will be downloaded into and loaded from") @@ -291,14 +429,25 @@ def encode(args): "Each output file contains chunk_size embeddings " "(except the last one if len(input) mod chunk_size != 0)") parser.add_argument("--input_file", help="Input .jsonl file. Each line is a dict object: {'id': 'xxx', 'text': 'abc...'}") - parser.add_argument("--output_file", help="Output .pkl file. A chunk index will be inserted between the name and extension: 'path/to/name_chunkidx.pkl' ." + parser.add_argument("--output_file", help="Output .pkl/.h5 file. A chunk index will be inserted between the name and extension: e.g. 'path/to/name_chunkidx.pkl' ." "Format: {'ids': List[str], 'embeddings': ndarray}. " "Note for hdf5, use f['ids'].asstr() to load ids as string because default is binary.") parser.add_argument("--adapter_name", help="For model_type=adapter, the name of the adapter that should be loaded") - parser.add_argument("--hdf5", action="store_true") + parser.add_argument("--hdf5", action="store_true", help="Save output with hdf5 instead of pickle") parser.add_argument("--hdf5-gzip-level", type=int, default=4, help="GZIP compression level for HDF5 in range 0-9, default 4 (bigger is more compressed). " - "Set to negative value to disable compression") - parser.add_argument("--float16", action="store_true") + "Set to negative value to disable compression." + "Only used when --hdf5 is also set.") + parser.add_argument("--float16", action="store_true", help="Save embeddings as float16") + parser.add_argument("--gpus", help="Set this value to use multiprocessing." + "Comma-separated list of devices (e.g., cuda:0,cuda:1) for multi-GPU processing." + "Reading, writing and each GPU is assigned its own process." 
+ "Can also be used with only one device to use the multiprocessing for reading/ writing of outputs but this is not necessarily faster with one GPU.") args = parser.parse_args() - encode(args) + start_time = time.time() + if not args.gpus: + encode(args) + else: + encode_multiprocess(args) + end_time = time.time() + logger.info(f"Finished encoding in {end_time-start_time}s") From bd6fdba2523431b4b2c6fe48fac2aa3049e21896 Mon Sep 17 00:00:00 2001 From: Geigle Date: Sat, 7 Aug 2021 16:15:11 +0200 Subject: [PATCH 21/23] Replaced loguru with standard Python logging (problems with using the logging.conf); Fixed missing updates for Auth Server unit tests; added Locust for load testing --- .github/workflows/ci.yml | 2 +- square-model-inference-api/Makefile | 1 + square-model-inference-api/README.md | 10 ++- .../auth_server/auth_api/security.py | 6 +- .../auth_server/main.py | 3 +- .../example_docker-compose.yml | 1 + .../inference_server/main.py | 8 +-- .../inference_server/requirements1.txt | 1 - .../api/routes/prediction.py | 5 +- .../core/event_handlers.py | 5 +- .../inference/adaptertransformer.py | 5 +- .../inference/sentencetransformer.py | 8 +-- .../inference/transformer.py | 5 +- .../pre_test_setup_for_docker_caching.py | 6 +- square-model-inference-api/locust/README.md | 52 +++++++++++++++ square-model-inference-api/locust/config.json | 41 ++++++++++++ .../locust/locustfile.py | 63 +++++++++++++++++++ .../locust/requirements.txt | 1 + square-model-inference-api/nginx/nginx.conf | 2 +- 19 files changed, 200 insertions(+), 25 deletions(-) create mode 100644 square-model-inference-api/locust/README.md create mode 100644 square-model-inference-api/locust/config.json create mode 100644 square-model-inference-api/locust/locustfile.py create mode 100644 square-model-inference-api/locust/requirements.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7bb1ad9c8..e4bd36c1d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -320,7 +320,7 @@ jobs: with: builder: ${{ steps.buildx.outputs.name }} context: ./square-model-inference-api/auth_server - #target: build # Not multistage + target: build push: ${{github.ref == 'refs/heads/master'}} tags: ${{ steps.prep.outputs.image }}:${{ steps.prep.outputs.tag }}, ${{ steps.prep.outputs.image }}:latest cache-from: type=local,src=/tmp/.buildx-cache diff --git a/square-model-inference-api/Makefile b/square-model-inference-api/Makefile index 0e8bc2e54..39899afde 100644 --- a/square-model-inference-api/Makefile +++ b/square-model-inference-api/Makefile @@ -8,6 +8,7 @@ all: clean test install run deploy down test: python -m pip install --upgrade pip && pip install pytest pytest-cov pytest-asyncio + PYTHONPATH=auth_server/ pytest --cov PYTHONPATH=inference_server/ pytest --cov install: diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 26dcb90a1..8b245517f 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -12,9 +12,9 @@ to handle authorization of requests with the auth server. ├───auth_server # FastAPI Authorization Server │ ├───main.py # Entry point in server │ ├───Dockerfile # Dockerfile for server +│ ├───tests # Unit Tests +│ │ ├───test_api │ └───auth_api -├───nginx # nginx config for API Gateway & Authorizatio -│ └───nginx.conf ├───inference_server # FastAPI Model API Server │ ├───tests # Unit Tests │ │ ├───test_api @@ -27,6 +27,9 @@ to handle authorization of requests with the auth server. 
│ ├───core # Server config, Startup logic, etc. │ ├───models # Input/ output modelling for API │ └───inference # Deep Model implementation and inference code for NLP tasks +├───nginx # nginx config for API Gateway & Authorizatio +│ └───nginx.conf +├───locust # Load testing configuration with Locust └───example_docker-compose.yml # Example docker-compose setup for the Model API ``` @@ -73,10 +76,11 @@ make deploy ``` #### Running Tests - +For unit tests: ```sh make test ``` +For load testing with Locust, see [this README](locust/README.md). ## Setup 1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. diff --git a/square-model-inference-api/auth_server/auth_api/security.py b/square-model-inference-api/auth_server/auth_api/security.py index 058086f88..9fd328025 100644 --- a/square-model-inference-api/auth_server/auth_api/security.py +++ b/square-model-inference-api/auth_server/auth_api/security.py @@ -3,10 +3,14 @@ from fastapi import HTTPException, Security from fastapi.security.api_key import APIKeyHeader from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED -from loguru import logger from .config import API_KEY, API_KEY_HEADER_NAME from .messages import AUTH_REQ, NO_API_KEY + +import logging + +logger = logging.getLogger(__name__) + api_key_header = APIKeyHeader(name=API_KEY_HEADER_NAME, auto_error=False) diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index fb323930d..6daf91883 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -1,8 +1,9 @@ from fastapi import FastAPI, Depends import auth_api.security as security from logging.config import fileConfig -from loguru import logger +import logging +logger = logging.getLogger(__name__) try: fileConfig("logging.conf") except: diff --git a/square-model-inference-api/example_docker-compose.yml b/square-model-inference-api/example_docker-compose.yml index e03421974..dbfa7e022 100644 --- a/square-model-inference-api/example_docker-compose.yml +++ b/square-model-inference-api/example_docker-compose.yml @@ -4,6 +4,7 @@ services: auth: #build: ./auth_server/. image: ukpsquare/square-model-api-auth:latest + container_name: square_model_auth ports: - 8081:8081 env_file: diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index a3335add8..3f06b41de 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -1,14 +1,14 @@ -import sys - from fastapi import FastAPI from square_model_inference.api.routes.router import api_router from square_model_inference.core.config import API_PREFIX, APP_NAME, APP_VERSION from square_model_inference.core.event_handlers import start_app_handler, stop_app_handler from logging.config import fileConfig -from loguru import logger +import logging + +logger = logging.getLogger(__name__) def get_app() -> FastAPI: - # Set logging config. Loguru uses this then, too + # Set logging config. 
try: fileConfig("logging.conf") except: diff --git a/square-model-inference-api/inference_server/requirements1.txt b/square-model-inference-api/inference_server/requirements1.txt index 794d45f82..6db7ba44c 100644 --- a/square-model-inference-api/inference_server/requirements1.txt +++ b/square-model-inference-api/inference_server/requirements1.txt @@ -1,7 +1,6 @@ uvicorn==0.13.4 # ASGI server fastapi==0.65.1 # REST API Framework pydantic==1.8.2 # Input/ output modelling -loguru==0.5.3 # Easier logging python-dotenv==0.17.1 # Required for .env configs sentencepiece==0.1.95 torch==1.8.1 diff --git a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py index eae7bda22..1de004a0d 100644 --- a/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/api/routes/prediction.py @@ -1,11 +1,14 @@ from fastapi import APIRouter from starlette.requests import Request -from loguru import logger from square_model_inference.models.request import PredictionRequest, Task from square_model_inference.models.prediction import PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding +import logging + +logger = logging.getLogger(__name__) + router = APIRouter() diff --git a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py index 6011b30ec..2a664f050 100644 --- a/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py +++ b/square-model-inference-api/inference_server/square_model_inference/core/event_handlers.py @@ -1,7 +1,6 @@ from typing import Callable from fastapi import FastAPI -from loguru import logger from square_model_inference.inference.adaptertransformer import AdapterTransformer from square_model_inference.core.config import MODEL_TYPE, MODEL_NAME, MODEL_CLASS, DISABLE_GPU, BATCH_SIZE, \ @@ -9,6 +8,10 @@ from square_model_inference.inference.sentencetransformer import SentenceTransformer from square_model_inference.inference.transformer import Transformer +import logging + +logger = logging.getLogger(__name__) + MODEL_MAPPING = { "adapter": AdapterTransformer, "transformer": Transformer, diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py index 99911bef2..2ad247089 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/adaptertransformer.py @@ -1,5 +1,3 @@ -from loguru import logger - from transformers import AutoModelWithHeads, list_adapters from square_model_inference.inference.transformer import Transformer @@ -7,6 +5,9 @@ from square_model_inference.models.prediction import PredictionOutput +import logging + +logger = logging.getLogger(__name__) class AdapterTransformer(Transformer): """ diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py 
b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py index 7523bab29..b07de3bd0 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/sentencetransformer.py @@ -1,9 +1,4 @@ -import json -from typing import Union, Tuple - import torch -from loguru import logger -import numpy as np from sentence_transformers import SentenceTransformer as SentenceTransformerModel from square_model_inference.inference.model import Model @@ -11,6 +6,9 @@ from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForEmbedding +import logging + +logger = logging.getLogger(__name__) class SentenceTransformer(Model): """ diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index c7657de47..65dcceb0a 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -3,7 +3,6 @@ from typing import Union, Tuple import torch -from loguru import logger import numpy as np from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \ AutoModelForTokenClassification, AutoModelForQuestionAnswering, AutoModelForCausalLM @@ -14,6 +13,10 @@ from square_model_inference.models.prediction import PredictionOutput, PredictionOutputForSequenceClassification, PredictionOutputForTokenClassification, \ PredictionOutputForQuestionAnswering, PredictionOutputForGeneration, PredictionOutputForEmbedding +import logging + +logger = logging.getLogger(__name__) + CLASS_MAPPING = { "base": AutoModel, "sequence_classification": AutoModelForSequenceClassification, diff --git a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py index 12c6de162..8460db236 100644 --- a/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py +++ b/square-model-inference-api/inference_server/tests/pre_test_setup_for_docker_caching.py @@ -1,12 +1,12 @@ # This file is used for the Docker image to cache long-running setup for the tests, # i.e., downloading finetuned Transformer models and so on. # This way, adding new tests or even changing the server code does NOT trigger a new download during building -import json - from sentence_transformers import SentenceTransformer from starlette.config import environ from transformers import AutoTokenizer, AutoModelWithHeads, list_adapters -from loguru import logger +import logging + +logger = logging.getLogger(__name__) TRANSFORMERS_TESTING_CACHE = "./.model_testing_cache" environ["TRANSFORMERS_CACHE"] = TRANSFORMERS_TESTING_CACHE diff --git a/square-model-inference-api/locust/README.md b/square-model-inference-api/locust/README.md new file mode 100644 index 000000000..261f52f94 --- /dev/null +++ b/square-model-inference-api/locust/README.md @@ -0,0 +1,52 @@ +# Locust +We use Locust for load testing. +See their [documentation](https://docs.locust.io/en/stable/) for more infos. + +## Setup +1. Install Python (>3.6) and Locust ([requirements.txt](requirements.txt)) on your system + (does not have to be the same system as where the Model API runs). +2. 
Write your ``config.json`` (see below for more) +3. Start the Model API servers (with Docker or locally or however you want). This does *not* have to be +on the same computer/ server as Locust. +4. Start the Locust server in this directory (``locust -f locustfile.py``), visit the web UI and start +the load test. For alternative methods see the above documentation. + +## config.json +We describe the keys with expected values for the config.json: +```json +{ + "config": { + # Time each user waits (min, max) after a request. Default [1, 2] + "wait_time": [1, 2], + # API key for authorization + "api_key": "example_key" + # Header for API key. Default: Authorization + "api_key_header": "Authorization" + }, + # List of Locust tasks. A user randomly selects from this list each time and starts the request + "tasks": [ + { + # Request to /api/$model/$endpoint + "endpoint": "embedding", + "model": "bert-base-uncased", + # Set to integer greater 1 to increase chance that this task is chosen. + # This task appears $weight-times in the list of tasks + # (from which the next task is uniformly chosen) + "weight": 1, + # The JSON of the query that is send to the Model API. + # See the documentation of the API for more details on this. + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "" + } + } + ] +} +``` diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json new file mode 100644 index 000000000..4e86ebaf6 --- /dev/null +++ b/square-model-inference-api/locust/config.json @@ -0,0 +1,41 @@ +{ + "config": { + + "wait_time": [0, 0.5], + "api_key": "example_key" + }, + "tasks": [ + { + "endpoint": "embedding", + "model": "bert-base-uncased", + "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + }, + { + "endpoint": "sequence-classification", + "model": "bert-base-uncased", + "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + } + ] +} \ No newline at end of file diff --git a/square-model-inference-api/locust/locustfile.py b/square-model-inference-api/locust/locustfile.py new file mode 100644 index 000000000..a8e4514b4 --- /dev/null +++ b/square-model-inference-api/locust/locustfile.py @@ -0,0 +1,63 @@ +import json + +from locust import between +from locust.contrib.fasthttp import FastHttpUser + + +def curry_config_in_task(callable, config, endpoint): + """ + Identical task function might be called with different configs (different model etc.) but Locust calls tasks only + with the user as argument. We curry the task function with the given config into a Locust-callable task function. + :param callable: the task function to use + :param config: the specific config to be used for the task function + :param endpoint: the model API task endpoint that is called + :return: a Locust task function with signature f(user) that calls callable(user, config, model_api_task) + """ + def task(user): + return callable(user, config, endpoint) + return task + + +def task_query(config, endpoint): + """ + Template to make Locust tasks for queries that are generated dynamically based on the given config and endpoint. 
+ Locust calls its task functions only with the user as argument so we create a closure and return it. + :param config: the config for the task with the model name, the API-Key, the JSON input, etc. + :param endpoint: the endpoint for the query + :return: the closure for the Locust task function that makes the specified query + """ + def query(user): + path = f"/api/{config['model']}/{endpoint}" + query_json = config["query_json"] + headers = {config.get("api_key_header", "Authorization"): config["api_key"]} + user.client.post(path, json=query_json, headers=headers) + return query + + +class ModelAPIUser(FastHttpUser): + wait_time = between(1, 2) + tasks = [] + + def __init__(self, *args, **kwargs): + # Load config + config = json.load(open("config.json")) + general_config = config["config"] + + # Setup User + wait_time = general_config.get("wait_time", [1, 2]) + # self.wait_time = between(...) does not work for some reason because it expects one argument that is not used but not supplied in calls + self.wait_time = lambda: between(wait_time[0], wait_time[1])(None) + + # Setup the Locust tasks + tasks = [] + for task in config["tasks"]: + task.update(general_config) + # Endpoint in URL uses - instead of _, so we replace it in case config was wrong + task_function = task_query(task, task["endpoint"].replace("_", "-")) + for _ in range(task.get("weight", 1)): + tasks.append(task_function) + self.tasks = tasks + + super().__init__(*args, **kwargs) + + diff --git a/square-model-inference-api/locust/requirements.txt b/square-model-inference-api/locust/requirements.txt new file mode 100644 index 000000000..61ddec44d --- /dev/null +++ b/square-model-inference-api/locust/requirements.txt @@ -0,0 +1 @@ +locust==2.0.0 \ No newline at end of file diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 1858be5bf..54fbc7187 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -39,7 +39,7 @@ http { location /auth { internal; - proxy_pass http://auth:8081/auth; + proxy_pass http://square_model_auth:8081/auth; } } } \ No newline at end of file From 4a577c361cdc10d17c3bddf791ce77d342d58ab3 Mon Sep 17 00:00:00 2001 From: Geigle Date: Sat, 7 Aug 2021 16:20:53 +0200 Subject: [PATCH 22/23] Removed unneeded function --- square-model-inference-api/locust/config.json | 21 ++----------------- .../locust/locustfile.py | 14 ------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json index 4e86ebaf6..1b7dd94ba 100644 --- a/square-model-inference-api/locust/config.json +++ b/square-model-inference-api/locust/config.json @@ -1,7 +1,6 @@ { "config": { - - "wait_time": [0, 0.5], + "wait_time": [1, 2], "api_key": "example_key" }, "tasks": [ @@ -18,23 +17,7 @@ "preprocessing_kwargs": { }, "model_kwargs": { }, "task_kwargs": { }, - "adapter_name": "nli/rte@ukp" - } - }, - { - "endpoint": "sequence-classification", - "model": "bert-base-uncased", - "weight": 1, - "query_json": { - "input": - [ - "test input" - ], - "is_preprocessed": false, - "preprocessing_kwargs": { }, - "model_kwargs": { }, - "task_kwargs": { }, - "adapter_name": "nli/rte@ukp" + "adapter_name": "" } } ] diff --git a/square-model-inference-api/locust/locustfile.py b/square-model-inference-api/locust/locustfile.py index a8e4514b4..7738f3278 100644 --- a/square-model-inference-api/locust/locustfile.py +++ 
b/square-model-inference-api/locust/locustfile.py @@ -4,20 +4,6 @@ from locust.contrib.fasthttp import FastHttpUser -def curry_config_in_task(callable, config, endpoint): - """ - Identical task function might be called with different configs (different model etc.) but Locust calls tasks only - with the user as argument. We curry the task function with the given config into a Locust-callable task function. - :param callable: the task function to use - :param config: the specific config to be used for the task function - :param endpoint: the model API task endpoint that is called - :return: a Locust task function with signature f(user) that calls callable(user, config, model_api_task) - """ - def task(user): - return callable(user, config, endpoint) - return task - - def task_query(config, endpoint): """ Template to make Locust tasks for queries that are generated dynamically based on the given config and endpoint. From d3c047ecb48a93fcc8db93ea209ba702a677ed89 Mon Sep 17 00:00:00 2001 From: Geigle Date: Tue, 10 Aug 2021 15:33:03 +0200 Subject: [PATCH 23/23] Finalized configs for CICD; fixed logging; updated documentation; embedding:added pooler output as option --- docker-compose.yaml | 38 +++++++++++++++++++ square-model-inference-api/README.md | 32 +++++++++++----- .../auth_server/main.py | 2 +- .../inference_server/.env.bert_adapter | 23 +++++++++++ .../inference_server/.env.dpr | 26 +++++++++++++ .../inference_server/main.py | 2 +- .../inference/transformer.py | 4 +- .../models/prediction.py | 2 +- .../square_model_inference/models/request.py | 3 +- .../tests/test_inference/test_transformers.py | 4 +- square-model-inference-api/locust/config.json | 18 ++++++++- square-model-inference-api/nginx/nginx.conf | 21 ++++++---- 12 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 square-model-inference-api/inference_server/.env.bert_adapter create mode 100644 square-model-inference-api/inference_server/.env.dpr diff --git a/docker-compose.yaml b/docker-compose.yaml index c093325f6..8d4d17ac9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -29,6 +29,44 @@ services: ports: - 80:80 +### MODEL API + model_auth: + image: ukpsquare/square-model-api-auth:latest + container_name: square_model_auth + ports: + - 8081:8081 + env_file: + - ./square-model-inference-api/auth_server/.env + + model_nginx: + image: nginx + ports: + - 8080:8080 + volumes: + - ./square-model-inference-api/nginx/nginx.conf:/etc/nginx/nginx.conf:ro + + inference_bert_adapter: + image: ukpsquare/square-model-api:latest + ports: + - 8000:8000 + env_file: + - ./square-model-inference-api/inference_server/.env.bert_adapter + container_name: square_model_inference_bert_adapter + volumes: + - ./.cache/:/etc/huggingface/.cache/ + + inference_dpr: + image: ukpsquare/square-model-api:latest + ports: + - 8001:8000 + env_file: + - ./square-model-inference-api/inference_server/.env.dpr + container_name: square_model_inference_dpr + volumes: + - ./.cache/:/etc/huggingface/.cache/ + +### / MODEL API Finished + #adminer: # image: adminer # restart: always diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md index 8b245517f..31eabd254 100644 --- a/square-model-inference-api/README.md +++ b/square-model-inference-api/README.md @@ -2,6 +2,15 @@ Inference API that supports SOTA (QA) models & adapters. Receives input and returns prediction and other artifacts (e.g. 
attention scores) +## On the API Path +The 'true' path of the API for the model server is of the form `/api/$endpoint` where the endpoint +is embeddings, question-answering, etc. This is the path you use if you just run a model server locally. + +However, to run and distinguish multiple models, we use an API gateway with nginx so we extend +the path to `/api/$modelname/$endpoint` which is then resolved by nginx to the correct model server and forwarded +to this server's `/api/$endpoint` endpoint. This is the path you use with Docker. +This requires you to setup the docker-compose and nginx config as described below. + ## Project structure The Model API uses 3 components: @@ -58,6 +67,20 @@ Both `transformers` and `adapter-transformers` use the same namespace so they co Thus, we first install `sentence-transformers` along with `transformers`, uninstall `transformers`, and finally install `adapter-transformers`. + +## Setup +### Docker +1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. +2. For each model server that should run, create a `.env.$model` to configure it. + See [here](inference_server/.env.example) for an example. +3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to + match `container_name` of each server in the `docker-compose.yaml`. +4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the + model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example. +### Local +Create `inference_server/.env` and configure it as needed for your local model server. +You do not need nginx and the authorization server for local testing. + ## Running #### Running Localhost @@ -81,12 +104,3 @@ For unit tests: make test ``` For load testing with Locust, see [this README](locust/README.md). - -## Setup -1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example. -2. For each model server that should run, create a `.env.$model` to configure it. - See [here](inference_server/.env.example) for an example. -3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to - match `container_name` of each server in the `docker-compose.yaml`. -4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the -model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example. \ No newline at end of file diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py index 6daf91883..b091250e9 100644 --- a/square-model-inference-api/auth_server/main.py +++ b/square-model-inference-api/auth_server/main.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) try: - fileConfig("logging.conf") + fileConfig("logging.conf", disable_existing_loggers=False) except: logger.info("Failed to load 'logging.conf'. 
Continuing without configuring the server logger") app = FastAPI() diff --git a/square-model-inference-api/inference_server/.env.bert_adapter b/square-model-inference-api/inference_server/.env.bert_adapter new file mode 100644 index 000000000..6453fb42e --- /dev/null +++ b/square-model-inference-api/inference_server/.env.bert_adapter @@ -0,0 +1,23 @@ +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers +MODEL_NAME=bert-base-uncased +# Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model +MODEL_TYPE=adapter + +# Disable CUDA even if available +DISABLE_GPU=True +# Batch size used for many inputs +BATCH_SIZE=32 +# Inputs larger than this size are rejected +MAX_INPUT_SIZE=1024 + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE=/etc/huggingface/.cache/ + + +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). +# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS=False \ No newline at end of file diff --git a/square-model-inference-api/inference_server/.env.dpr b/square-model-inference-api/inference_server/.env.dpr new file mode 100644 index 000000000..a1e3784a4 --- /dev/null +++ b/square-model-inference-api/inference_server/.env.dpr @@ -0,0 +1,26 @@ +# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers +MODEL_NAME=facebook/dpr-question_encoder-single-nq-base +# Type of the model, e.g. Transformers, Adapter, ... +# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model +MODEL_TYPE=transformer + +# Disable CUDA even if available +DISABLE_GPU=False +# Batch size used for many inputs +BATCH_SIZE=32 +# Inputs larger than this size are rejected +MAX_INPUT_SIZE=1024 + +# Cache directory where model weights are stored +# This is the name for the env variable used by transformers and sentence-transformers package +TRANSFORMERS_CACHE=/etc/huggingface/.cache/ + +# For MODEL_TYPE=transformers: decides the AutoModel* class used +# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class +MODEL_CLASS=base + +# Flag that decides if returned numpy arrays are returned +# as lists or encoded to base64 (smaller but not easily human readable). +# See the comment in square_model_inference.models.prediction._encode_numpy on information on how to decode +# the base64 string back to the numpy array +RETURN_PLAINTEXT_ARRAYS=False \ No newline at end of file diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py index 3f06b41de..849fa23a2 100644 --- a/square-model-inference-api/inference_server/main.py +++ b/square-model-inference-api/inference_server/main.py @@ -10,7 +10,7 @@ def get_app() -> FastAPI: # Set logging config. try: - fileConfig("logging.conf") + fileConfig("logging.conf", disable_existing_loggers=False) except: logger.info("Failed to load 'logging.conf'. 
Continuing without configuring the server logger") fast_app = FastAPI(title=APP_NAME, version=APP_VERSION) diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py index 65dcceb0a..5b013d8a1 100644 --- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py +++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py @@ -29,7 +29,7 @@ class Transformer(Model): """ The class for all Huggingface transformer-based models """ - SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"] + SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token", "pooler"] def __init__(self, model_name, model_class, batch_size, disable_gpu, max_input_size, **kwargs): """ @@ -127,6 +127,8 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput: if embedding_mode == "cls": emb = hidden_state[:, 0, :] + elif embedding_mode == "pooler": + emb = predictions["pooler_output"] # copied from sentence-transformers pooling elif embedding_mode == "max": input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float() diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py index a3f1d7fbb..f6a10712f 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py @@ -124,7 +124,7 @@ def __init__(self, **data): class PredictionOutputForEmbedding(PredictionOutput): - embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.
One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')") + embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.
One of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token')") word_ids: List[List[Optional[int]]] = Field([], description="Only used by Transformers/ Adapters.
" "Only set with embedding_mode='token'." " Mapping from each token to the corresponding word in the input. " diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py index 547de9191..89678d9a6 100644 --- a/square-model-inference-api/inference_server/square_model_inference/models/request.py +++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py @@ -62,7 +62,8 @@ class PredictionRequest(BaseModel): "'token_classification':
" "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token'). " + "'pooler' uses the pooler_output of a Transformer, i.e. the processed CLS token. Default value 'mean'.
" "'question_answering':
" "- 'topk': Return the top-k most likely spans. Default 1.
" "- 'max_answer_len': Maximal token length of answers. Default 128.
" diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index f389e9d00..c715fd2f1 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -82,7 +82,9 @@ class TestTransformerEmbedding: (["this is a test"], "max"), (["this is a test", "this is a test with a longer sentence"], "max"), (["this is a test"], "cls"), - (["this is a test", "this is a test with a longer sentence"], "cls")], + (["this is a test", "this is a test with a longer sentence"], "cls"), + (["this is a test"], "pooler"), + (["this is a test", "this is a test with a longer sentence"], "pooler")], ) async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): prediction_request.input = input diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json index 1b7dd94ba..94fc2f27b 100644 --- a/square-model-inference-api/locust/config.json +++ b/square-model-inference-api/locust/config.json @@ -5,9 +5,25 @@ }, "tasks": [ { - "endpoint": "embedding", + "endpoint": "sequence-classification", "model": "bert-base-uncased", "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + }, + { + "endpoint": "embedding", + "model": "facebook/dpr-question_encoder-single-nq-base", + "weight": 1, "query_json": { "input": [ diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 54fbc7187..56a4d7e53 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -22,21 +22,26 @@ http { access_log /var/log/nginx/access.log json_combined; listen 8080; - - location / { - proxy_pass http://square_model_inference_bert:8000/; + # Model API Documentation + location /docs { + proxy_pass http://square_model_inference_bert_adapter:8000/docs; + } + location /redoc { + proxy_pass http://square_model_inference_bert_adapter:8000/redoc; } + # Model Server API Gateway location /api/bert-base-uncased { auth_request /auth; - proxy_pass http://square_model_inference_bert:8000/api; + proxy_pass http://square_model_inference_bert_adapter:8000/api; } -# location /api/roberta-base { -# auth_request /auth; -# proxy_pass http://square_model_inference_roberta:8000/api; -# } + location /api/facebook/dpr-question_encoder-single-nq-base { + auth_request /auth; + proxy_pass http://square_model_inference_dpr:8000/api; + } + # Auth Server location /auth { internal; proxy_pass http://square_model_auth:8081/auth;