Elasticsearch Version
serverless
Installed Plugins
No response
Java Version
bundled
OS Version
serverless
Problem Description
I have an inference endpoint up and running. It had been working fine for several days, but the cluster suddenly started responding with a 404 error when calling the inference endpoint.
Steps to Reproduce
GET grocery_items/_search
{
  "retriever": {
    "standard": {
      "query": {
        "nested": {
          "path": "Product Description_semantic.inference.chunks",
          "query": {
            "sparse_vector": {
              "inference_id": "elser-endpoint",
              "field": "Product Description_semantic.inference.chunks.embeddings",
              "query": "find me lunch meat"
            }
          },
          "inner_hits": {
            "size": 2,
            "name": "grocery_items.Product Description_semantic",
            "_source": [
              "Product Description_semantic.inference.chunks.text"
            ]
          }
        }
      }
    }
  }
}
response
{
  "error": {
    "root_cause": [
      {
        "type": "status_exception",
        "reason": "Unable to find model deployment task [elser-endpoint] please stop and start the deployment or try again momentarily"
      }
    ],
    "type": "status_exception",
    "reason": "Unable to find model deployment task [elser-endpoint] please stop and start the deployment or try again momentarily"
  },
  "status": 404
}
GET _inference/elser-endpoint
response
{
  "endpoints": [
    {
      "inference_id": "elser-endpoint",
      "task_type": "sparse_embedding",
      "service": "elasticsearch",
      "service_settings": {
        "num_allocations": 10,
        "num_threads": 1,
        "model_id": ".elser_model_2_linux-x86_64",
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        }
      }
    }
  ]
}
GET _ml/trained_models/.elser_model_2_linux-x86_64/_stats
response
{
  "count": 1,
  "trained_model_stats": [
    {
      "model_id": ".elser_model_2_linux-x86_64",
      "model_size_stats": {
        "model_size_bytes": 274756282,
        "required_native_memory_bytes": 2101346304
      },
      "pipeline_count": 1,
      "ingest": {
        "total": {
          "count": 0,
          "time_in_millis": 0,
          "current": 0,
          "failed": 0
        },
        "pipelines": {
          ".kibana-elastic-ai-assistant-ingest-pipeline-knowledge-base": {
            "count": 0,
            "time_in_millis": 0,
            "current": 0,
            "failed": 0,
            "ingested_as_first_pipeline_in_bytes": 0,
            "produced_as_first_pipeline_in_bytes": 0,
            "processors": [
              {
                "inference": {
                  "type": "inference",
                  "stats": {
                    "count": 0,
                    "time_in_millis": 0,
                    "current": 0,
                    "failed": 0
                  }
                }
              }
            ]
          }
        }
      },
      "inference_stats": {
        "failure_count": 0,
        "inference_count": 2,
        "cache_miss_count": 0,
        "missing_all_fields_count": 0,
        "timestamp": 1729883203847
      },
      "deployment_stats": {
        "deployment_id": "elser-endpoint",
        "model_id": ".elser_model_2_linux-x86_64",
        "threads_per_allocation": 1,
        "number_of_allocations": 10,
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        },
        "queue_capacity": 1024,
        "cache_size": "262mb",
        "priority": "normal",
        "start_time": 1728864116915,
        "inference_count": 2,
        "peak_throughput_per_minute": 1,
        "nodes": [
          {
            "node": {
              "serverless": {
                "name": "serverless",
                "ephemeral_id": "serverless",
                "transport_address": "0.0.0.0:0",
                "external_id": "serverless",
                "attributes": {},
                "roles": [
                  "ml"
                ],
                "version": "9.0.0",
                "min_index_version": 8000099,
                "max_index_version": 9000000
              }
            },
            "routing_state": {
              "routing_state": "started"
            },
            "inference_count": 2,
            "average_inference_time_ms": 84,
            "average_inference_time_ms_excluding_cache_hits": 84,
            "inference_cache_hit_count": 0,
            "last_access": 1729882326131,
            "number_of_pending_requests": 0,
            "start_time": 1729875144036,
            "threads_per_allocation": 1,
            "number_of_allocations": 1,
            "peak_throughput_per_minute": 1,
            "throughput_last_minute": 0,
            "inference_cache_hit_count_last_minute": 0
          }
        ]
      }
    }
  ]
}
What's also weird is that when I tried to deploy a second ELSER endpoint as a backup:
PUT _inference/sparse_embedding/elser-endpoint-backup
{
  "service": "elser",
  "service_settings": {
    "adaptive_allocations": {
      "enabled": true,
      "min_number_of_allocations": 10,
      "max_number_of_allocations": 100
    },
    "num_threads": 1
  }
}
response
{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign model [my-e5-endpoint]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign model [my-e5-endpoint]"
  },
  "status": 400
}
Serverless prod deployment: fbe378fd1dab4affa7c981c90a03f440
Logs (if relevant)
App log, last successful call (Central US time):
2024-10-25 13:51:57,332 - INFO - _async_transport.py - perform_request - line 271 - POST https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443/users/_search [status:200 duration:0.045s]
Then, on the next call:
2024-10-25 13:52:04,257 - INFO - _async_transport.py - perform_request - line 271 - POST https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443/grocery_items/_search [status:404 duration:0.045s]
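For context, the log entries above come from an async elasticsearch-py client. The following is only a minimal sketch of how the app issues the failing search, assuming a reasonably recent 8.x client that accepts the retriever parameter; the endpoint URL, API key, and function name are placeholders, while the index, retriever body, and inference_id are copied from the request in Steps to Reproduce.

# Minimal sketch (assumptions noted above): issue the same retriever search
# that the app log shows returning 404.
import asyncio
from elasticsearch import AsyncElasticsearch

async def search_grocery_items():
    es = AsyncElasticsearch(
        "https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443",
        api_key="<api-key>",  # placeholder credentials
    )
    try:
        resp = await es.search(
            index="grocery_items",
            retriever={
                "standard": {
                    "query": {
                        "nested": {
                            "path": "Product Description_semantic.inference.chunks",
                            "query": {
                                "sparse_vector": {
                                    "inference_id": "elser-endpoint",
                                    "field": "Product Description_semantic.inference.chunks.embeddings",
                                    "query": "find me lunch meat",
                                }
                            },
                        }
                    }
                }
            },
        )
        print(resp["hits"])
    finally:
        await es.close()

asyncio.run(search_grocery_items())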