Skip to content

Inference Endpoint becomes unavailable #115692

Open
@jeffvestal

Description

@jeffvestal

Elasticsearch Version

serverless

Installed Plugins

No response

Java Version

bundled

OS Version

serverless

Problem Description

I have an Inference endpoint up and running. It has been working fine for several days.
The cluster suddenly started responding with a 404 error when I tried to call the inference endpoint.

Steps to Reproduce

GET grocery_items/_search
   {
        "retriever": {
                        "standard": {
                            "query": {
                                "nested": {
                                    "path": "Product Description_semantic.inference.chunks",
                                    "query": {
                                        "sparse_vector": {
                                            "inference_id": "elser-endpoint",
                                            "field": "Product Description_semantic.inference.chunks.embeddings",
                                            "query": "find me lunch meat"
                                        }
                                    },
                                    "inner_hits": {
                                        "size": 2,
                                        "name": "grocery_items.Product Description_semantic",
                                        "_source": [
                                            "Product Description_semantic.inference.chunks.text"
                                        ]
                                    }
                                }
                            }
                        }
        }
   }

response

{
  "error": {
    "root_cause": [
      {
        "type": "status_exception",
        "reason": "Unable to find model deployment task [elser-endpoint] please stop and start the deployment or try again momentarily"
      }
    ],
    "type": "status_exception",
    "reason": "Unable to find model deployment task [elser-endpoint] please stop and start the deployment or try again momentarily"
  },
  "status": 404
}

GET _inference/elser-endpoint
response

{
  "endpoints": [
    {
      "inference_id": "elser-endpoint",
      "task_type": "sparse_embedding",
      "service": "elasticsearch",
      "service_settings": {
        "num_allocations": 10,
        "num_threads": 1,
        "model_id": ".elser_model_2_linux-x86_64",
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        }
      }
    }
  ]
}

get _ml/trained_models/.elser_model_2_linux-x86_64/_stats
response

{
  "count": 1,
  "trained_model_stats": [
    {
      "model_id": ".elser_model_2_linux-x86_64",
      "model_size_stats": {
        "model_size_bytes": 274756282,
        "required_native_memory_bytes": 2101346304
      },
      "pipeline_count": 1,
      "ingest": {
        "total": {
          "count": 0,
          "time_in_millis": 0,
          "current": 0,
          "failed": 0
        },
        "pipelines": {
          ".kibana-elastic-ai-assistant-ingest-pipeline-knowledge-base": {
            "count": 0,
            "time_in_millis": 0,
            "current": 0,
            "failed": 0,
            "ingested_as_first_pipeline_in_bytes": 0,
            "produced_as_first_pipeline_in_bytes": 0,
            "processors": [
              {
                "inference": {
                  "type": "inference",
                  "stats": {
                    "count": 0,
                    "time_in_millis": 0,
                    "current": 0,
                    "failed": 0
                  }
                }
              }
            ]
          }
        }
      },
      "inference_stats": {
        "failure_count": 0,
        "inference_count": 2,
        "cache_miss_count": 0,
        "missing_all_fields_count": 0,
        "timestamp": 1729883203847
      },
      "deployment_stats": {
        "deployment_id": "elser-endpoint",
        "model_id": ".elser_model_2_linux-x86_64",
        "threads_per_allocation": 1,
        "number_of_allocations": 10,
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        },
        "queue_capacity": 1024,
        "cache_size": "262mb",
        "priority": "normal",
        "start_time": 1728864116915,
        "inference_count": 2,
        "peak_throughput_per_minute": 1,
        "nodes": [
          {
            "node": {
              "serverless": {
                "name": "serverless",
                "ephemeral_id": "serverless",
                "transport_address": "0.0.0.0:0",
                "external_id": "serverless",
                "attributes": {},
                "roles": [
                  "ml"
                ],
                "version": "9.0.0",
                "min_index_version": 8000099,
                "max_index_version": 9000000
              }
            },
            "routing_state": {
              "routing_state": "started"
            },
            "inference_count": 2,
            "average_inference_time_ms": 84,
            "average_inference_time_ms_excluding_cache_hits": 84,
            "inference_cache_hit_count": 0,
            "last_access": 1729882326131,
            "number_of_pending_requests": 0,
            "start_time": 1729875144036,
            "threads_per_allocation": 1,
            "number_of_allocations": 1,
            "peak_throughput_per_minute": 1,
            "throughput_last_minute": 0,
            "inference_cache_hit_count_last_minute": 0
          }
        ]
      }
    }
  ]
}

Though what's weird is what happened when I tried to deploy a backup ELSER endpoint:

PUT _inference/sparse_embedding/elser-endpoint-backup
{
  "service": "elser",
  "service_settings": {
    "adaptive_allocations": {
      "enabled": true,
      "min_number_of_allocations": 10,
      "max_number_of_allocations": 100
    },
    "num_threads": 1
  }
}

response

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign model [my-e5-endpoint]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign model [my-e5-endpoint]"
  },
  "status": 400
}

serverless prod - fbe378fd1dab4affa7c981c90a03f440

Logs (if relevant)

App log — last successful call (Central US time):

2024-10-25 13:51:57,332 - INFO - _async_transport.py - perform_request - line 271 - POST [https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443/users/_search](https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud/users/_search) [status:200 duration:0.045s]

Then, on the next call:

2024-10-25 13:52:04,257 - INFO - _async_transport.py - perform_request - line 271 - POST [https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443/grocery_items/_search](https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud/grocery_items/_search) [status:404 duration:0.045s]

Metadata

Metadata

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions