38 changes: 34 additions & 4 deletions docs/llm/reference.md
@@ -122,6 +122,19 @@ utilization of resource will be lower. Old cache will be cleared automatically b

Another cache-related option is `cache_eviction_config`, which can help with the latency of long generations, but at the cost of accuracy. Its type is defined as follows:
```
message KVCrushConfig {
enum AnchorPointMode {
RANDOM = 0;
ZEROS = 1;
ONES = 2;
MEAN = 3;
ALTERNATING = 4;
}
optional uint64 budget = 2 [default = 0];
optional AnchorPointMode anchor_point_mode = 3 [default = RANDOM];
optional uint64 rng_seed = 4 [default = 0];
}

message CacheEvictionConfig {
enum AggregationMode {
SUM = 0; // In this mode the importance scores of each token will be summed after each step of generation
@@ -133,6 +146,8 @@
required uint64 recent_size = 3;
required uint64 max_cache_size = 4;
optional bool apply_rotation = 5 [default = false];
optional uint64 snapkv_window_size = 6 [default = 8];
optional KVCrushConfig kv_crush_config = 7;
}
```
Learn more about the algorithm and the above parameters in the [GenAI docs](https://github.com/openvinotoolkit/openvino.genai/blob/master/site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md).
@@ -142,12 +157,27 @@ Example of cache eviction config in the node options:
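A minimal sketch of such an entry in `graph.pbtxt`, assuming the standard mediapipe `node_options` wrapper and the `LLMCalculatorOptions` schema above (values are illustrative placeholders, not tuned recommendations):
```
node_options: {
    [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
        # ... other node options (models_path etc.) ...
        cache_eviction_config: {
            aggregation_mode: NORM_SUM
            start_size: 32
            recent_size: 128
            max_cache_size: 1024
            apply_rotation: false
            snapkv_window_size: 8
            kv_crush_config: {
                budget: 128
                anchor_point_mode: RANDOM
                rng_seed: 0
            }
        }
    }
}
```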
### Scheduling settings
Depending on the use case and load profile, request and token scheduling can play a significant role in performance.

- `dynamic_split_fuse` [algorithm](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#b-dynamic-splitfuse-) is enabled by default to boost throughput by splitting tokens into even chunks. In some conditions, such as very low concurrency or very short prompts, it might be beneficial to disable this algorithm.

- Since `max_num_batched_tokens` defines how many tokens a pipeline can process in one step, when `dynamic_split_fuse` is disabled, `max_num_batched_tokens` should be set to match the model's maximum context length, since the prompt is not split and must be processed fully in one step.

- Setting `max_num_seqs` might also be useful for maintaining a certain generation speed for requests already in the pipeline. This value should not be higher than `max_num_batched_tokens`.

- The scheduler configuration also accepts a sparse attention config with the following options in `graph.pbtxt` (a combined example follows this list):
```
enum SparseAttentionMode {
TRISHAPE = 0;
XATTENTION = 1;
}
optional SparseAttentionMode mode = 1 [default = TRISHAPE];
optional uint64 num_last_dense_tokens_in_prefill = 2;
optional uint64 num_retained_start_tokens_in_cache = 3;
optional uint64 num_retained_recent_tokens_in_cache = 4;
optional float xattention_threshold = 5;
optional uint64 xattention_block_size = 6;
optional uint64 xattention_stride = 7;
```
A description of the parameters in this config can be found in the GenAI docs for [SparseAttentionConfig](https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.SparseAttentionConfig.html#openvino-genai-sparseattentionconfig).
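As a sketch, the scheduling options from the list above could be combined in `graph.pbtxt` as below; the values are illustrative only, and any unset sparse-attention fields fall back to the GenAI defaults linked above:
```
node_options: {
    [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
        # dynamic_split_fuse disabled, so max_num_batched_tokens is set to the model's max context length
        dynamic_split_fuse: false
        max_num_batched_tokens: 8192
        max_num_seqs: 256
        sparse_attention_config: {
            mode: TRISHAPE
            num_last_dense_tokens_in_prefill: 100
        }
    }
}
```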

**Note that the following options are ignored in Stateful servables (so in deployments on NPU): cache_size, dynamic_split_fuse, max_num_batched_tokens, max_num_seqs, enable_prefix_caching**

@@ -159,7 +189,7 @@ __Tool parsers:__
- `hermes3` (also works for Qwen3 models)
- `llama3`
- `phi4`
- `mistral`
- `gptoss`
- `qwen3coder`

@@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include <openvino/genai/cache_eviction.hpp>
#include <openvino/genai/sparse_attention.hpp>
#include <openvino/genai/continuous_batching_pipeline.hpp>
#include <openvino/openvino.hpp>
#include <spdlog/spdlog.h>
@@ -41,6 +42,38 @@

namespace ovms {

ov::genai::SparseAttentionConfig prepareSparseAttentionConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
ov::genai::SparseAttentionMode mode;
if (nodeOptions.sparse_attention_config().mode() == mediapipe::LLMCalculatorOptions::SparseAttentionConfig::TRISHAPE) {
mode = ov::genai::SparseAttentionMode::TRISHAPE;
} else {
mode = ov::genai::SparseAttentionMode::XATTENTION;
}
// Use default constructor to rely on GenAI defined defaults if user did not set specific fields
ov::genai::SparseAttentionConfig sparseAttentionConfig;
sparseAttentionConfig.mode = mode;
if (nodeOptions.sparse_attention_config().has_num_last_dense_tokens_in_prefill()) {
sparseAttentionConfig.num_last_dense_tokens_in_prefill = nodeOptions.sparse_attention_config().num_last_dense_tokens_in_prefill();
}
if (nodeOptions.sparse_attention_config().has_num_retained_start_tokens_in_cache()) {
sparseAttentionConfig.num_retained_start_tokens_in_cache = nodeOptions.sparse_attention_config().num_retained_start_tokens_in_cache();
}
if (nodeOptions.sparse_attention_config().has_num_retained_recent_tokens_in_cache()) {
sparseAttentionConfig.num_retained_recent_tokens_in_cache = nodeOptions.sparse_attention_config().num_retained_recent_tokens_in_cache();
}
if (nodeOptions.sparse_attention_config().has_xattention_threshold()) {
sparseAttentionConfig.xattention_threshold = nodeOptions.sparse_attention_config().xattention_threshold();
}
if (nodeOptions.sparse_attention_config().has_xattention_block_size()) {
sparseAttentionConfig.xattention_block_size = nodeOptions.sparse_attention_config().xattention_block_size();
}
if (nodeOptions.sparse_attention_config().has_xattention_stride()) {
sparseAttentionConfig.xattention_stride = nodeOptions.sparse_attention_config().xattention_stride();
}

return sparseAttentionConfig;
}

ov::genai::CacheEvictionConfig prepareCacheEvictionConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
ov::genai::AggregationMode aggregationMode;
if (nodeOptions.cache_eviction_config().aggregation_mode() == mediapipe::LLMCalculatorOptions::CacheEvictionConfig::SUM) {
@@ -52,7 +85,36 @@ ov::genai::CacheEvictionConfig prepareCacheEvictionConfig(const mediapipe::LLMCa
size_t recentSize = nodeOptions.cache_eviction_config().recent_size();
size_t maxCacheSize = nodeOptions.cache_eviction_config().max_cache_size();
bool applyRotation = nodeOptions.cache_eviction_config().apply_rotation();
size_t snapkvWindowSize = nodeOptions.cache_eviction_config().snapkv_window_size();

ov::genai::KVCrushConfig kvCrushConfig;
if (nodeOptions.cache_eviction_config().has_kv_crush_config()) {
ov::genai::KVCrushAnchorPointMode anchorPointMode;
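// Map the proto anchor point mode to the corresponding GenAI enum, falling back to RANDOM for unrecognized values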
switch (nodeOptions.cache_eviction_config().kv_crush_config().anchor_point_mode()) {
case mediapipe::LLMCalculatorOptions::KVCrushConfig::RANDOM:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::RANDOM;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::ZEROS:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::ZEROS;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::ONES:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::ONES;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::MEAN:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::MEAN;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::ALTERNATING:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::ALTERNATING;
break;
default:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::RANDOM;
break;
}
size_t budget = nodeOptions.cache_eviction_config().kv_crush_config().budget();
size_t rngSeed = nodeOptions.cache_eviction_config().kv_crush_config().rng_seed();
kvCrushConfig = ov::genai::KVCrushConfig(budget, anchorPointMode, rngSeed);
}
return ov::genai::CacheEvictionConfig(startSize, recentSize, maxCacheSize, aggregationMode, applyRotation, snapkvWindowSize, kvCrushConfig);
}

ov::genai::SchedulerConfig ContinuousBatchingServableInitializer::prepareDraftPipelineSchedulerConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
@@ -97,6 +159,13 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
properties->schedulerConfig.use_cache_eviction = false;
}

if (nodeOptions.has_sparse_attention_config()) {
properties->schedulerConfig.use_sparse_attention = true;
properties->schedulerConfig.sparse_attention_config = prepareSparseAttentionConfig(nodeOptions);
} else {
properties->schedulerConfig.use_sparse_attention = false;
}

properties->device = nodeOptions.device();
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->enableToolGuidedGeneration = nodeOptions.enable_tool_guided_generation();
36 changes: 34 additions & 2 deletions src/llm/llm_calculator.proto
@@ -26,17 +26,47 @@ message LLMCalculatorOptions {
optional LLMCalculatorOptions ext = 113473750;
}

message KVCrushConfig {
enum AnchorPointMode {
RANDOM = 0;
ZEROS = 1;
ONES = 2;
MEAN = 3;
ALTERNATING = 4;
}
optional uint64 budget = 2 [default = 0];
optional AnchorPointMode anchor_point_mode = 3 [default = RANDOM];
optional uint64 rng_seed = 4 [default = 0];
}

message CacheEvictionConfig {
enum AggregationMode {
SUM = 0; // In this mode the importance scores of each token will be summed after each step of generation
NORM_SUM = 1; // Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache
}

optional AggregationMode aggregation_mode = 1 [default = NORM_SUM];
required uint64 start_size = 2;
required uint64 recent_size = 3;
required uint64 max_cache_size = 4;
optional bool apply_rotation = 5 [default = false];
optional uint64 snapkv_window_size = 6 [default = 8];
optional KVCrushConfig kv_crush_config = 7;
}

message SparseAttentionConfig {
enum SparseAttentionMode {
TRISHAPE = 0;
XATTENTION = 1;
}
optional SparseAttentionMode mode = 1 [default = TRISHAPE];
// As for parameters below, if they are not set, default values are set on GenAI level
// https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.SparseAttentionConfig.html#openvino_genai.SparseAttentionConfig.__init__
optional uint64 num_last_dense_tokens_in_prefill = 2;
optional uint64 num_retained_start_tokens_in_cache = 3;
optional uint64 num_retained_recent_tokens_in_cache = 4;
optional float xattention_threshold = 5;
optional uint64 xattention_block_size = 6;
optional uint64 xattention_stride = 7;
}

enum PipelineType {
@@ -101,4 +131,6 @@ message LLMCalculatorOptions {
optional string reasoning_parser = 21;

optional bool enable_tool_guided_generation = 22 [default = false];

optional SparseAttentionConfig sparse_attention_config = 23;
}