diff --git a/docker-compose.yaml b/docker-compose.yaml
index c093325f6..8d4d17ac9 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -29,6 +29,44 @@ services:
     ports:
       - 80:80
 
+### MODEL API
+  model_auth:
+    image: ukpsquare/square-model-api-auth:latest
+    container_name: square_model_auth
+    ports:
+      - 8081:8081
+    env_file:
+      - ./square-model-inference-api/auth_server/.env
+
+  model_nginx:
+    image: nginx
+    ports:
+      - 8080:8080
+    volumes:
+      - ./square-model-inference-api/nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+
+  inference_bert_adapter:
+    image: ukpsquare/square-model-api:latest
+    ports:
+      - 8000:8000
+    env_file:
+      - ./square-model-inference-api/inference_server/.env.bert_adapter
+    container_name: square_model_inference_bert_adapter
+    volumes:
+      - ./.cache/:/etc/huggingface/.cache/
+
+  inference_dpr:
+    image: ukpsquare/square-model-api:latest
+    ports:
+      - 8001:8000
+    env_file:
+      - ./square-model-inference-api/inference_server/.env.dpr
+    container_name: square_model_inference_dpr
+    volumes:
+      - ./.cache/:/etc/huggingface/.cache/
+
+### / MODEL API Finished
+
 #adminer:
 #  image: adminer
 #  restart: always
diff --git a/square-model-inference-api/README.md b/square-model-inference-api/README.md
index 8b245517f..31eabd254 100644
--- a/square-model-inference-api/README.md
+++ b/square-model-inference-api/README.md
@@ -2,6 +2,15 @@
 Inference API that supports SOTA (QA) models & adapters.
 Receives input and returns prediction and other artifacts (e.g. attention scores)
 
+## On the API Path
+The 'true' path of the API for the model server is of the form `/api/$endpoint` where the endpoint
+is embeddings, question-answering, etc. This is the path you use if you just run a model server locally.
+
+However, to run and distinguish multiple models, we use an API gateway with nginx so we extend
+the path to `/api/$modelname/$endpoint` which is then resolved by nginx to the correct model server and forwarded
+to this server's `/api/$endpoint` endpoint. This is the path you use with Docker.
+This requires you to set up the docker-compose and nginx config as described below.
+
 ## Project structure
 
 The Model API uses 3 components:
@@ -58,6 +67,20 @@ Both `transformers` and `adapter-transformers` use the same namespace so they co
 
 Thus, we first install `sentence-transformers` along with `transformers`, uninstall `transformers`, and finally install `adapter-transformers`.
 
+
+## Setup
+### Docker
+1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example.
+2. For each model server that should run, create a `.env.$model` to configure it.
+   See [here](inference_server/.env.example) for an example.
+3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to
+   match `container_name` of each server in the `docker-compose.yaml`.
+4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the
+   model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example.
+### Local
+Create `inference_server/.env` and configure it as needed for your local model server.
+You do not need nginx and the authorization server for local testing.
+
 ## Running
 
 #### Running Localhost
@@ -81,12 +104,3 @@ For unit tests:
 make test
 ```
 For load testing with Locust, see [this README](locust/README.md).
-
-## Setup
-1. Create `auth_server/.env` with secret API key. See [here](auth_server/.env.example) for an example.
-2. For each model server that should run, create a `.env.$model` to configure it.
-   See [here](inference_server/.env.example) for an example.
-3. Configure `nginx/nginx.conf` to correctly forward requests to each server. The server DNS name has to
-   match `container_name` of each server in the `docker-compose.yaml`.
-4. Configure `docker-compose.yaml` by adding services for the auth server, nginx (with the config), and the
-model servers (each with their .env file). See [example_docker-compose.yml](example_docker-compose.yml) for an example.
\ No newline at end of file
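Note: to make the path convention from the README change above concrete, here is a minimal sketch of the same request sent once to a locally running model server and once through the nginx gateway defined in this compose file. The endpoint name `embedding`, the header carrying the API key, and the key value are assumptions based on the example configs below, not guaranteed parts of the API; adjust them to your `auth_server/.env` and nginx routes.

```python
import requests

payload = {
    "input": ["test input"],
    "is_preprocessed": False,
    "preprocessing_kwargs": {},
    "model_kwargs": {},
    "task_kwargs": {},
}

# Local run: a single model server, 'true' path /api/$endpoint
r = requests.post("http://localhost:8000/api/embedding", json=payload)

# Docker run: nginx gateway on port 8080, extended path /api/$modelname/$endpoint
r = requests.post(
    "http://localhost:8080/api/facebook/dpr-question_encoder-single-nq-base/embedding",
    json=payload,
    headers={"Authorization": "<API key from auth_server/.env>"},  # header name is an assumption
)
print(r.status_code, r.json())
```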
diff --git a/square-model-inference-api/auth_server/main.py b/square-model-inference-api/auth_server/main.py
index 6daf91883..b091250e9 100644
--- a/square-model-inference-api/auth_server/main.py
+++ b/square-model-inference-api/auth_server/main.py
@@ -5,7 +5,7 @@
 logger = logging.getLogger(__name__)
 
 try:
-    fileConfig("logging.conf")
+    fileConfig("logging.conf", disable_existing_loggers=False)
 except:
     logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger")
 app = FastAPI()
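Note: `disable_existing_loggers=False` matters here because `logging.config.fileConfig` disables all previously created loggers by default, which would silence the module-level `logger` defined a few lines above the `try` block. A minimal, self-contained illustration of the behaviour:

```python
import logging
from logging.config import fileConfig

logger = logging.getLogger(__name__)  # created before the config file is loaded

try:
    # With the default disable_existing_loggers=True this call would mute `logger`;
    # passing False keeps loggers that already exist active.
    fileConfig("logging.conf", disable_existing_loggers=False)
except Exception:
    logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger")

logger.info("this message is still emitted")
```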
diff --git a/square-model-inference-api/inference_server/.env.bert_adapter b/square-model-inference-api/inference_server/.env.bert_adapter
new file mode 100644
index 000000000..6453fb42e
--- /dev/null
+++ b/square-model-inference-api/inference_server/.env.bert_adapter
@@ -0,0 +1,23 @@
+# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers
+MODEL_NAME=bert-base-uncased
+# Type of the model, e.g. Transformers, Adapter, ...
+# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model
+MODEL_TYPE=adapter
+
+# Disable CUDA even if available
+DISABLE_GPU=True
+# Batch size used for many inputs
+BATCH_SIZE=32
+# Inputs larger than this size are rejected
+MAX_INPUT_SIZE=1024
+
+# Cache directory where model weights are stored
+# This is the name of the env variable used by the transformers and sentence-transformers packages
+TRANSFORMERS_CACHE=/etc/huggingface/.cache/
+
+
+# Flag that decides if returned numpy arrays are returned
+# as lists or encoded to base64 (smaller but not easily human readable).
+# See the comment in square_model_inference.models.prediction._encode_numpy for information on how to decode
+# the base64 string back to the numpy array
+RETURN_PLAINTEXT_ARRAYS=False
\ No newline at end of file
diff --git a/square-model-inference-api/inference_server/.env.dpr b/square-model-inference-api/inference_server/.env.dpr
new file mode 100644
index 000000000..a1e3784a4
--- /dev/null
+++ b/square-model-inference-api/inference_server/.env.dpr
@@ -0,0 +1,26 @@
+# Corresponds to the Huggingface name for finetuned Transformers or the name of a finetuned SentenceTransformers
+MODEL_NAME=facebook/dpr-question_encoder-single-nq-base
+# Type of the model, e.g. Transformers, Adapter, ...
+# See square_model_inference.core.event_handlers.MODEL_MAPPING for all available names with corresponding model
+MODEL_TYPE=transformer
+
+# Disable CUDA even if available
+DISABLE_GPU=False
+# Batch size used for many inputs
+BATCH_SIZE=32
+# Inputs larger than this size are rejected
+MAX_INPUT_SIZE=1024
+
+# Cache directory where model weights are stored
+# This is the name of the env variable used by the transformers and sentence-transformers packages
+TRANSFORMERS_CACHE=/etc/huggingface/.cache/
+
+# For MODEL_TYPE=transformers: decides the AutoModel* class used
+# See square_model_inference.inference.transformer.CLASS_MAPPING for valid names and corresponding class
+MODEL_CLASS=base
+
+# Flag that decides if returned numpy arrays are returned
+# as lists or encoded to base64 (smaller but not easily human readable).
+# See the comment in square_model_inference.models.prediction._encode_numpy for information on how to decode
+# the base64 string back to the numpy array
+RETURN_PLAINTEXT_ARRAYS=False
\ No newline at end of file
diff --git a/square-model-inference-api/inference_server/main.py b/square-model-inference-api/inference_server/main.py
index 3f06b41de..849fa23a2 100644
--- a/square-model-inference-api/inference_server/main.py
+++ b/square-model-inference-api/inference_server/main.py
@@ -10,7 +10,7 @@ def get_app() -> FastAPI:
 
     # Set logging config.
     try:
-        fileConfig("logging.conf")
+        fileConfig("logging.conf", disable_existing_loggers=False)
     except:
         logger.info("Failed to load 'logging.conf'. Continuing without configuring the server logger")
     fast_app = FastAPI(title=APP_NAME, version=APP_VERSION)
diff --git a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py
index 65dcceb0a..5b013d8a1 100644
--- a/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py
+++ b/square-model-inference-api/inference_server/square_model_inference/inference/transformer.py
@@ -29,7 +29,7 @@ class Transformer(Model):
     """
     The class for all Huggingface transformer-based models
     """
-    SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token"]
+    SUPPORTED_EMBEDDING_MODES = ["mean", "max", "cls", "token", "pooler"]
 
     def __init__(self, model_name, model_class, batch_size, disable_gpu, max_input_size, **kwargs):
         """
@@ -127,6 +127,8 @@ def _embedding(self, request: PredictionRequest) -> PredictionOutput:
 
         if embedding_mode == "cls":
             emb = hidden_state[:, 0, :]
+        elif embedding_mode == "pooler":
+            emb = predictions["pooler_output"]
         # copied from sentence-transformers pooling
         elif embedding_mode == "max":
             input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
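Note: the new `pooler` mode returns the model's `pooler_output`, i.e. the [CLS] hidden state passed through the model's pooling layer, rather than the raw [CLS] vector that `cls` uses. A sketch of the difference using plain `transformers` outside this API (not every architecture exposes a `pooler_output`, e.g. DistilBERT does not, in which case this mode cannot be used):

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

with torch.no_grad():
    outputs = model(**tokenizer("this is a test", return_tensors="pt"))

cls_emb = outputs.last_hidden_state[:, 0, :]  # what embedding_mode="cls" uses
pooler_emb = outputs.pooler_output            # what embedding_mode="pooler" uses
print(cls_emb.shape, pooler_emb.shape)        # both torch.Size([1, 768]) for BERT base
```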
diff --git a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py
index a3f1d7fbb..f6a10712f 100644
--- a/square-model-inference-api/inference_server/square_model_inference/models/prediction.py
+++ b/square-model-inference-api/inference_server/square_model_inference/models/prediction.py
@@ -124,7 +124,7 @@ def __init__(self, **data):
 
 
 class PredictionOutputForEmbedding(PredictionOutput):
-    embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.\nOne of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token')")
+    embedding_mode: str = Field("", description="Only used by Transformers/ Adapters.\nOne of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token')")
     word_ids: List[List[Optional[int]]] = Field([], description="Only used by Transformers/ Adapters.\n"
                                                                  "Only set with embedding_mode='token'."
                                                                  " Mapping from each token to the corresponding word in the input. "
" "- 'is_regression': Flag to treat output of models with num_labels>1 as regression, too, i.e., no softmax and no labels are returned
" "'embedding':
" - "- 'embedding_mode: One of 'mean', 'max', 'cls', 'token'. The pooling mode used (or not used for 'token'). Default 'mean'.
" + "- 'embedding_mode: One of 'mean', 'max', 'cls', 'pooler', 'token'. The pooling mode used (or not used for 'token'). " + "'pooler' uses the pooler_output of a Transformer, i.e. the processed CLS token. Default value 'mean'.
" "'question_answering':
" "- 'topk': Return the top-k most likely spans. Default 1.
" "- 'max_answer_len': Maximal token length of answers. Default 128.
" diff --git a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py index f389e9d00..c715fd2f1 100644 --- a/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py +++ b/square-model-inference-api/inference_server/tests/test_inference/test_transformers.py @@ -82,7 +82,9 @@ class TestTransformerEmbedding: (["this is a test"], "max"), (["this is a test", "this is a test with a longer sentence"], "max"), (["this is a test"], "cls"), - (["this is a test", "this is a test with a longer sentence"], "cls")], + (["this is a test", "this is a test with a longer sentence"], "cls"), + (["this is a test"], "pooler"), + (["this is a test", "this is a test with a longer sentence"], "pooler")], ) async def test_embedding(self, prediction_request, test_transformer_embedding, input, mode): prediction_request.input = input diff --git a/square-model-inference-api/locust/config.json b/square-model-inference-api/locust/config.json index 1b7dd94ba..94fc2f27b 100644 --- a/square-model-inference-api/locust/config.json +++ b/square-model-inference-api/locust/config.json @@ -5,9 +5,25 @@ }, "tasks": [ { - "endpoint": "embedding", + "endpoint": "sequence-classification", "model": "bert-base-uncased", "weight": 1, + "query_json": { + "input": + [ + "test input" + ], + "is_preprocessed": false, + "preprocessing_kwargs": { }, + "model_kwargs": { }, + "task_kwargs": { }, + "adapter_name": "nli/rte@ukp" + } + }, + { + "endpoint": "embedding", + "model": "facebook/dpr-question_encoder-single-nq-base", + "weight": 1, "query_json": { "input": [ diff --git a/square-model-inference-api/nginx/nginx.conf b/square-model-inference-api/nginx/nginx.conf index 54fbc7187..56a4d7e53 100644 --- a/square-model-inference-api/nginx/nginx.conf +++ b/square-model-inference-api/nginx/nginx.conf @@ -22,21 +22,26 @@ http { access_log /var/log/nginx/access.log json_combined; listen 8080; - - location / { - proxy_pass http://square_model_inference_bert:8000/; + # Model API Documentation + location /docs { + proxy_pass http://square_model_inference_bert_adapter:8000/docs; + } + location /redoc { + proxy_pass http://square_model_inference_bert_adapter:8000/redoc; } + # Model Server API Gateway location /api/bert-base-uncased { auth_request /auth; - proxy_pass http://square_model_inference_bert:8000/api; + proxy_pass http://square_model_inference_bert_adapter:8000/api; } -# location /api/roberta-base { -# auth_request /auth; -# proxy_pass http://square_model_inference_roberta:8000/api; -# } + location /api/facebook/dpr-question_encoder-single-nq-base { + auth_request /auth; + proxy_pass http://square_model_inference_dpr:8000/api; + } + # Auth Server location /auth { internal; proxy_pass http://square_model_auth:8081/auth;