38 changes: 34 additions & 4 deletions docs/llm/reference.md
@@ -122,6 +122,19 @@ utilization of resource will be lower. Old cache will be cleared automatically b

Another cache-related option is `cache_eviction_config`, which can help with the latency of long generations, but at the cost of accuracy. Its type is defined as follows:
```
message KVCrushConfig {
enum AnchorPointMode {
RANDOM = 0;
ZEROS = 1;
ONES = 2;
MEAN = 3;
ALTERNATING = 4;
}
optional uint64 budget = 2 [default = 0];
optional AnchorPointMode anchor_point_mode = 3 [default = RANDOM];
optional uint64 rng_seed = 4 [default = 0];
}

message CacheEvictionConfig {
enum AggregationMode {
SUM = 0; // In this mode the importance scores of each token will be summed after each step of generation
@@ -133,6 +146,8 @@
required uint64 recent_size = 3;
required uint64 max_cache_size = 4;
optional bool apply_rotation = 5 [default = false];
optional uint64 snapkv_window_size = 6 [default = 8];
optional KVCrushConfig kv_crush_config = 7;
}
```
Learn more about the algorithm and the above parameters in the [GenAI docs](https://github.com/openvinotoolkit/openvino.genai/blob/master/site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md).
@@ -142,12 +157,27 @@ Example of cache eviction config in the node options:
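A minimal sketch of such an entry in `graph.pbtxt`, assuming the standard mediapipe `node_options` wrapper and the `LLMCalculatorOptions` schema above (values are illustrative placeholders, not tuned recommendations):
```
node_options: {
    [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
        # ... other node options (models_path etc.) ...
        cache_eviction_config: {
            aggregation_mode: NORM_SUM
            start_size: 32
            recent_size: 128
            max_cache_size: 1024
            apply_rotation: false
            snapkv_window_size: 8
            kv_crush_config: {
                budget: 128
                anchor_point_mode: RANDOM
                rng_seed: 0
            }
        }
    }
}
```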
### Scheduling settings
Depending on the use case and load profile, request and token scheduling can play a significant role in performance.

- `dynamic_split_fuse` [algorithm](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#b-dynamic-splitfuse-) is enabled by default to boost throughput by splitting tokens into even chunks. In some conditions, such as very low concurrency or very short prompts, it might be beneficial to disable this algorithm.

- Since `max_num_batched_tokens` defines how many tokens a pipeline can process in one step, when `dynamic_split_fuse` is disabled, `max_num_batched_tokens` should be set to match the model's maximum context length, since the prompt is not split and must be processed fully in one step.

- Setting `max_num_seqs` might also be useful for maintaining a certain generation speed for requests already in the pipeline. This value should not be higher than `max_num_batched_tokens`.

- The scheduler configuration also accepts a sparse attention config with the following options in `graph.pbtxt` (a combined example follows this list):
```
enum SparseAttentionMode {
TRISHAPE = 0;
XATTENTION = 1;
}
optional SparseAttentionMode mode = 1 [default = TRISHAPE];
optional uint64 num_last_dense_tokens_in_prefill = 2;
optional uint64 num_retained_start_tokens_in_cache = 3;
optional uint64 num_retained_recent_tokens_in_cache = 4;
optional float xattention_threshold = 5;
optional uint64 xattention_block_size = 6;
optional uint64 xattention_stride = 7;
```
A description of the parameters in this config can be found in the GenAI docs for [SparseAttentionConfig](https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.SparseAttentionConfig.html#openvino-genai-sparseattentionconfig).
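As a sketch, the scheduling options from the list above could be combined in `graph.pbtxt` as below; the values are illustrative only, and any unset sparse-attention fields fall back to the GenAI defaults linked above:
```
node_options: {
    [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
        # dynamic_split_fuse disabled, so max_num_batched_tokens is set to the model's max context length
        dynamic_split_fuse: false
        max_num_batched_tokens: 8192
        max_num_seqs: 256
        sparse_attention_config: {
            mode: TRISHAPE
            num_last_dense_tokens_in_prefill: 100
        }
    }
}
```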

**Note that the following options are ignored in Stateful servables (so in deployments on NPU): cache_size, dynamic_split_fuse, max_num_batched_tokens, max_num_seqs, enable_prefix_caching**

@@ -159,7 +189,7 @@ __Tool parsers:__
- `hermes3` (also works for Qwen3 models)
- `llama3`
- `phi4`
- `mistral`
- `gptoss`
- `qwen3coder`

@@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include <openvino/genai/cache_eviction.hpp>
#include <openvino/genai/sparse_attention.hpp>
#include <openvino/genai/continuous_batching_pipeline.hpp>
#include <openvino/openvino.hpp>
#include <spdlog/spdlog.h>
@@ -41,6 +42,38 @@

namespace ovms {

ov::genai::SparseAttentionConfig prepareSparseAttentionConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
ov::genai::SparseAttentionMode mode;
if (nodeOptions.sparse_attention_config().mode() == mediapipe::LLMCalculatorOptions::SparseAttentionConfig::TRISHAPE) {
mode = ov::genai::SparseAttentionMode::TRISHAPE;
} else {
mode = ov::genai::SparseAttentionMode::XATTENTION;
}
// Use default constructor to rely on GenAI defined defaults if user did not set specific fields
ov::genai::SparseAttentionConfig sparseAttentionConfig;
sparseAttentionConfig.mode = mode;
if (nodeOptions.sparse_attention_config().has_num_last_dense_tokens_in_prefill()) {
sparseAttentionConfig.num_last_dense_tokens_in_prefill = nodeOptions.sparse_attention_config().num_last_dense_tokens_in_prefill();
}
if (nodeOptions.sparse_attention_config().has_num_retained_start_tokens_in_cache()) {
sparseAttentionConfig.num_retained_start_tokens_in_cache = nodeOptions.sparse_attention_config().num_retained_start_tokens_in_cache();
}
if (nodeOptions.sparse_attention_config().has_num_retained_recent_tokens_in_cache()) {
sparseAttentionConfig.num_retained_recent_tokens_in_cache = nodeOptions.sparse_attention_config().num_retained_recent_tokens_in_cache();
}
if (nodeOptions.sparse_attention_config().has_xattention_threshold()) {
sparseAttentionConfig.xattention_threshold = nodeOptions.sparse_attention_config().xattention_threshold();
}
if (nodeOptions.sparse_attention_config().has_xattention_block_size()) {
sparseAttentionConfig.xattention_block_size = nodeOptions.sparse_attention_config().xattention_block_size();
}
if (nodeOptions.sparse_attention_config().has_xattention_stride()) {
sparseAttentionConfig.xattention_stride = nodeOptions.sparse_attention_config().xattention_stride();
}

return sparseAttentionConfig;
}

ov::genai::CacheEvictionConfig prepareCacheEvictionConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
ov::genai::AggregationMode aggregationMode;
if (nodeOptions.cache_eviction_config().aggregation_mode() == mediapipe::LLMCalculatorOptions::CacheEvictionConfig::SUM) {
@@ -52,7 +85,36 @@ ov::genai::CacheEvictionConfig prepareCacheEvictionConfig(const mediapipe::LLMCa
size_t recentSize = nodeOptions.cache_eviction_config().recent_size();
size_t maxCacheSize = nodeOptions.cache_eviction_config().max_cache_size();
bool applyRotation = nodeOptions.cache_eviction_config().apply_rotation();
size_t snapkvWindowSize = nodeOptions.cache_eviction_config().snapkv_window_size();

ov::genai::KVCrushConfig kvCrushConfig;
if (nodeOptions.cache_eviction_config().has_kv_crush_config()) {
ov::genai::KVCrushAnchorPointMode anchorPointMode;
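// Map the proto anchor point mode to the corresponding GenAI enum, falling back to RANDOM for unrecognized values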
switch (nodeOptions.cache_eviction_config().kv_crush_config().anchor_point_mode()) {
case mediapipe::LLMCalculatorOptions::KVCrushConfig::RANDOM:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::RANDOM;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::ZEROS:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::ZEROS;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::ONES:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::ONES;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::MEAN:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::MEAN;
break;
case mediapipe::LLMCalculatorOptions::KVCrushConfig::ALTERNATING:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::ALTERNATING;
break;
default:
anchorPointMode = ov::genai::KVCrushAnchorPointMode::RANDOM;
break;
}
size_t budget = nodeOptions.cache_eviction_config().kv_crush_config().budget();
size_t rngSeed = nodeOptions.cache_eviction_config().kv_crush_config().rng_seed();
kvCrushConfig = ov::genai::KVCrushConfig(budget, anchorPointMode, rngSeed);
}
return ov::genai::CacheEvictionConfig(startSize, recentSize, maxCacheSize, aggregationMode, applyRotation, snapkvWindowSize, kvCrushConfig);
}

ov::genai::SchedulerConfig ContinuousBatchingServableInitializer::prepareDraftPipelineSchedulerConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
@@ -97,6 +159,13 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
properties->schedulerConfig.use_cache_eviction = false;
}

if (nodeOptions.has_sparse_attention_config()) {
properties->schedulerConfig.use_sparse_attention = true;
properties->schedulerConfig.sparse_attention_config = prepareSparseAttentionConfig(nodeOptions);
} else {
properties->schedulerConfig.use_sparse_attention = false;
}

properties->device = nodeOptions.device();
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->enableToolGuidedGeneration = nodeOptions.enable_tool_guided_generation();
36 changes: 34 additions & 2 deletions src/llm/llm_calculator.proto
@@ -26,17 +26,47 @@ message LLMCalculatorOptions {
optional LLMCalculatorOptions ext = 113473750;
}

message KVCrushConfig {
enum AnchorPointMode {
RANDOM = 0;
ZEROS = 1;
ONES = 2;
MEAN = 3;
ALTERNATING = 4;
}
optional uint64 budget = 2 [default = 0];
optional AnchorPointMode anchor_point_mode = 3 [default = RANDOM];
optional uint64 rng_seed = 4 [default = 0];
}

message CacheEvictionConfig {
enum AggregationMode {
SUM = 0; // In this mode the importance scores of each token will be summed after each step of generation
NORM_SUM = 1; // Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache
}

optional AggregationMode aggregation_mode = 1 [default = NORM_SUM];
required uint64 start_size = 2;
required uint64 recent_size = 3;
required uint64 max_cache_size = 4;
optional bool apply_rotation = 5 [default = false];
optional uint64 snapkv_window_size = 6 [default = 8];
optional KVCrushConfig kv_crush_config = 7;
}

message SparseAttentionConfig {
enum SparseAttentionMode {
TRISHAPE = 0;
XATTENTION = 1;
}
optional SparseAttentionMode mode = 1 [default = TRISHAPE];
// As for parameters below, if they are not set, default values are set on GenAI level
// https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.SparseAttentionConfig.html#openvino_genai.SparseAttentionConfig.__init__
optional uint64 num_last_dense_tokens_in_prefill = 2;
optional uint64 num_retained_start_tokens_in_cache = 3;
optional uint64 num_retained_recent_tokens_in_cache = 4;
optional float xattention_threshold = 5;
optional uint64 xattention_block_size = 6;
optional uint64 xattention_stride = 7;
}

enum PipelineType {
@@ -101,4 +131,6 @@ message LLMCalculatorOptions {
optional string reasoning_parser = 21;

optional bool enable_tool_guided_generation = 22 [default = false];

optional SparseAttentionConfig sparse_attention_config = 23;
}