diff --git a/docker-compose.yaml b/docker-compose.yaml
index c093325f6..8d4d17ac9 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -29,6 +29,44 @@ services:
ports:
- 80:80
+### MODEL API
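+ # Auth server: validates the secret API key for requests routed through the nginx gateway (auth_request)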
+ model_auth:
+ image: ukpsquare/square-model-api-auth:latest
+ container_name: square_model_auth
+ ports:
+ - 8081:8081
+ env_file:
+ - ./square-model-inference-api/auth_server/.env
+
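+ # API gateway: single entry point on port 8080 that routes /api/$modelname/... to the matching model server (see square-model-inference-api/nginx/nginx.conf)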
+ model_nginx:
+ image: nginx
+ ports:
+ - 8080:8080
+ volumes:
+ - ./square-model-inference-api/nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+
+ inference_bert_adapter:
+ image: ukpsquare/square-model-api:latest
+ ports:
+ - 8000:8000
+ env_file:
+ - ./square-model-inference-api/inference_server/.env.bert_adapter
+ container_name: square_model_inference_bert_adapter
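+ # ./.cache on the host is mounted as the Huggingface cache (TRANSFORMERS_CACHE) so downloaded weights persist and are shared with the other model server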
+ volumes:
+ - ./.cache/:/etc/huggingface/.cache/
+
+ inference_dpr:
+ image: ukpsquare/square-model-api:latest
+ ports:
+ - 8001:8000
+ env_file:
+ - ./square-model-inference-api/inference_server/.env.dpr
+ container_name: square_model_inference_dpr
+ volumes:
+ - ./.cache/:/etc/huggingface/.cache/
+
+### / MODEL API
+
#adminer:
# image: adminer
# restart: always
diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md
index 8b245517f..31eabd254 100644
--- a/square-model-inference-api/README.md
+++ b/square-model-inference-api/README.md
@@ -2,6 +2,15 @@
Inference API that supports SOTA (QA) models & adapters.
Receives input and returns prediction and other artifacts (e.g. attention scores)
+## On the API Path
+The 'true' path of a model server's API has the form `/api/$endpoint`, where `$endpoint` is
+`embedding`, `question-answering`, etc. This is the path you use when you run a single model server locally.
+
+However, to run and distinguish multiple models, we put an nginx API gateway in front of the model servers
+and extend the path to `/api/$modelname/$endpoint`. nginx resolves `$modelname` to the correct model server
+and forwards the request to that server's `/api/$endpoint` endpoint. This is the path you use with Docker.
+It requires the docker-compose and nginx configuration described below.
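+
+For example, assuming a model server on port 8000 and the gateway on port 8080 as in this repository's
+`docker-compose.yaml` (the endpoint, HTTP method, and request body are illustrative only, not the full request schema):
+```bash
+# Directly against a locally running model server (no gateway):
+curl -X POST -H "Content-Type: application/json" -d '{"input": ["test input"]}' http://localhost:8000/api/embedding
+
+# Through the nginx gateway from docker-compose (additionally pass the API-key header your auth server expects):
+curl -X POST -H "Content-Type: application/json" -d '{"input": ["test input"]}' http://localhost:8080/api/bert-base-uncased/embedding
+```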
+
## Project structure
The Model API uses 3 components:
@@ -58,6 +67,20 @@ Both `transformers` and `adapter-transformers` use the same namespace so they co
Thus, we first install `sentence-transformers` along with `transformers`,
uninstall `transformers`, and finally install `adapter-transformers`.
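+
+For illustration, the resulting install order boils down to something like the following (a sketch, not the
+exact pinned commands used by the project):
+```bash
+pip install sentence-transformers   # pulls in transformers as a dependency
+pip uninstall -y transformers
+pip install adapter-transformers    # provides the transformers namespace from now on
+```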
+
+## Setup
+### Docker
+1. Create `auth_server/.env` with the secret API key. See [here](auth_server/.env.example) for an example.
+2. For each model server that should run, create a `.env.$model` to configure it.
+ See [here](inference_server/.env.example) for an example.
+3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name in each
+ `proxy_pass` has to match the `container_name` of the corresponding server in the `docker-compose.yaml`.
+4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the
+ model servers (each with their `.env` file). See [example_docker-compose.yml](example_docker-compose.yml) for an example, and the start command sketched after this list.
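+
+Once everything is configured, the whole Model API stack can be started with something like the following
+(the service names are the ones used in this repository's `docker-compose.yaml`; adjust them to your setup):
+```bash
+docker-compose up -d model_auth model_nginx inference_bert_adapter inference_dpr
+```
+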
+### Local
+Create `inference_server/.env` and configure it as needed for your local model server.
+You do not need nginx or the auth server for local testing.
+
## Running
#### Running Localhost
@@ -81,12 +104,3 @@ For unit tests:
make test
```
For load testing with Locust, see [this README](locust/README.md).
-
-## Setup
-1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example.
-2. For each model server that should run, create a `.env.$model` to configure it.
- See [here](inference_server/.env.example) for an example.
-3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to
- match `container_name` of each server in the `docker-compose.yaml`.
-4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the
-model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example.
\ No newline at end of file
diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py
index 6daf91883..b091250e9 100644
--- a/square-model-inference-api/auth_server/main.py
+++ b/square-model-inference-api/auth_server/main.py
@@ -5,7 +5,7 @@
logger = logging.getLogger(__name__)
try:
- fileConfig("logging.conf")
+ fileConfig("logging.conf", disable_existing_loggers=False)
except:
logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger")
app = FastAPI()
diff --git a/square-model-inference-api/inference_server/.env.bert_adapter b/square-model-inference-api/inference_server/.env.bert_adapter
new file mode 100644
index 000000000..6453fb42e
--- /dev/null
+++ b/square-model-inference-api/inference_server/.env.bert_adapter
@@ -0,0 +1,23 @@
+# The Huggingface model name for (fine-tuned) Transformers models, or the name of a fine-tuned SentenceTransformers model
+MODEL_NAME=bert-base-uncased
+# Type of the model, e.g. Transformers, Adapter, ...
+# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model
+MODEL_TYPE=adapter
+
+# Disable CUDA even if available
+DISABLE_GPU=True
+# Batch size used for many inputs
+BATCH_SIZE=32
+# Inputs larger than this size are rejected
+MAX_INPUT_SIZE=1024
+
+# Cache directory where model weights are stored
+# This is the env variable name used by the transformers and sentence-transformers packages
+TRANSFORMERS_CACHE=/etc/huggingface/.cache/
+
+
+# Flag that decides whether numpy arrays are returned
+# as lists or base64-encoded (smaller but not easily human-readable).
+# See the comment in square_model_inference.models.prediction._encode_numpy for information on how to decode
+# the base64 string back to the numpy array
+RETURN_PLAINTEXT_ARRAYS=False
\ No newline at end of file
diff --git a/square-model-inference-api/inference_server/.env.dpr b/square-model-inference-api/inference_server/.env.dpr
new file mode 100644
index 000000000..a1e3784a4
--- /dev/null
+++ b/square-model-inference-api/inference_server/.env.dpr
@@ -0,0 +1,26 @@
+# The Huggingface model name for (fine-tuned) Transformers models, or the name of a fine-tuned SentenceTransformers model
+MODEL_NAME=facebook/dpr-question_encoder-single-nq-base
+# Type of the model, e.g. Transformers, Adapter, ...
+# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model
+MODEL_TYPE=transformer
+
+# Disable CUDA even if available
+DISABLE_GPU=False
+# Batch size used for many inputs
+BATCH_SIZE=32
+# Inputs larger than this size are rejected
+MAX_INPUT_SIZE=1024
+
+# Cache directory where model weights are stored
+# This is the env variable name used by the transformers and sentence-transformers packages
+TRANSFORMERS_CACHE=/etc/huggingface/.cache/
+
+# For MODEL_TYPE=transformers: decides the AutoModel* class used
+# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class
+MODEL_CLASS=base
+
+# Flag that decides whether numpy arrays are returned
+# as lists or base64-encoded (smaller but not easily human-readable).
+# See the comment in square_model_inference.models.prediction._encode_numpy for information on how to decode
+# the base64 string back to the numpy array
+RETURN_PLAINTEXT_ARRAYS=False
\ No newline at end of file
diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py
index 3f06b41de..849fa23a2 100644
--- a/square-model-inference-api/inference_server/main.py
+++ b/square-model-inference-api/inference_server/main.py
@@ -10,7 +10,7 @@
def get_app() -> FastAPI:
# Set logging config.
try:
- fileConfig("logging.conf")
+ fileConfig("logging.conf", disable_existing_loggers=False)
except:
logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger")
fast_app = FastAPI(title=APP_NAME, version=APP_VERSION)
diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py
index 65dcceb0a..5b013d8a1 100644
--- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py
+++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py
@@ -29,7 +29,7 @@ class Transformer(Model):
"""
The class for all Huggingface transformer-based models
"""
- SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"]
+ SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token", "pooler"]
def __init__(self, model_name, model_class, batch_size, disable_gpu, max_input_size, **kwargs):
"""
@@ -127,6 +127,8 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput:
if embedding_mode == "cls":
emb = hidden_state[:, 0, :]
+ elif embedding_mode == "pooler":
+ emb = predictions["pooler_output"]
# copied from sentence-transformers pooling
elif embedding_mode == "max":
input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py
index a3f1d7fbb..f6a10712f 100644
--- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py
+++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py
@@ -124,7 +124,7 @@ def __init__(self, **data):
class PredictionOutputForEmbedding(PredictionOutput):
- embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.\nOne of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')")
+ embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.\nOne of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token')")
word_ids: List[List[Optional[int]]] = Field([], description="Only used by Transformers/ Adapters.\n"
"Only set with embedding_mode='token'."
" Mapping from each token to the corresponding word in the input. "
diff --git a/square-model-inference-api/inference_server/square_model_inference/models/request.py b/square-model-inference-api/inference_server/square_model_inference/models/request.py
index 547de9191..89678d9a6 100644
--- a/square-model-inference-api/inference_server/square_model_inference/models/request.py
+++ b/square-model-inference-api/inference_server/square_model_inference/models/request.py
@@ -62,7 +62,8 @@ class PredictionRequest(BaseModel):
"'token_classification':
"
"- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
"
"'embedding':
"
- "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
"
+ "- 'embedding_mode: One of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token'). "
+ "'pooler' uses the pooler_output of a Transformer, i.e. the processed CLS token. Default value 'mean'.
"
"'question_answering':
"
"- 'topk': Return the top-k most likely spans. Default 1.
"
"- 'max_answer_len': Maximal token length of answers. Default 128.
"
diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py
index f389e9d00..c715fd2f1 100644
--- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py
+++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py
@@ -82,7 +82,9 @@ class TestTransformerEmbedding:
(["this is a test"], "max"),
(["this is a test", "this is a test with a longer sentence"], "max"),
(["this is a test"], "cls"),
- (["this is a test", "this is a test with a longer sentence"], "cls")],
+ (["this is a test", "this is a test with a longer sentence"], "cls"),
+ (["this is a test"], "pooler"),
+ (["this is a test", "this is a test with a longer sentence"], "pooler")],
)
async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode):
prediction_request.input = input
diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json
index 1b7dd94ba..94fc2f27b 100644
--- a/square-model-inference-api/locust/config.json
+++ b/square-model-inference-api/locust/config.json
@@ -5,9 +5,25 @@
},
"tasks": [
{
- "endpoint": "embedding",
+ "endpoint": "sequence-classification",
"model": "bert-base-uncased",
"weight": 1,
+ "query_json": {
+ "input":
+ [
+ "test input"
+ ],
+ "is_preprocessed": false,
+ "preprocessing_kwargs": { },
+ "model_kwargs": { },
+ "task_kwargs": { },
+ "adapter_name": "nli/rte@ukp"
+ }
+ },
+ {
+ "endpoint": "embedding",
+ "model": "facebook/dpr-question_encoder-single-nq-base",
+ "weight": 1,
"query_json": {
"input":
[
diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf
index 54fbc7187..56a4d7e53 100644
--- a/square-model-inference-api/nginx/nginx.conf
+++ b/square-model-inference-api/nginx/nginx.conf
@@ -22,21 +22,26 @@ http {
access_log /var/log/nginx/access.log json_combined;
listen 8080;
-
- location / {
- proxy_pass http://square_model_inference_bert:8000/;
+ # Model API Documentation
+ location /docs {
+ proxy_pass http://square_model_inference_bert_adapter:8000/docs;
+ }
+ location /redoc {
+ proxy_pass http://square_model_inference_bert_adapter:8000/redoc;
}
+ # Model Server API Gateway
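+ # The matched prefix /api/$modelname is replaced by /api when proxying, so each model server only sees /api/$endpoint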
location /api/bert-base-uncased {
auth_request /auth;
- proxy_pass http://square_model_inference_bert:8000/api;
+ proxy_pass http://square_model_inference_bert_adapter:8000/api;
}
-# location /api/roberta-base {
-# auth_request /auth;
-# proxy_pass http://square_model_inference_roberta:8000/api;
-# }
+ location /api/facebook/dpr-question_encoder-single-nq-base {
+ auth_request /auth;
+ proxy_pass http://square_model_inference_dpr:8000/api;
+ }
+ # Auth Server
location /auth {
internal;
proxy_pass http://square_model_auth:8081/auth;