From 401e494cc30fff737b2848486d4a4061d4d2838f Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Fri, 5 Sep 2025 21:30:16 +0000
Subject: [PATCH 1/3] Fix logic of distinguishing between prefill and generate stages for gemma3 & handling of token_type_ids input

---
 .../src/plugin/npuw/llm_compiled_model.cpp |  2 +
 .../src/plugin/npuw/llm_infer_request.cpp  | 85 ++++++++++++++++---
 .../src/plugin/npuw/llm_infer_request.hpp  | 21 +++--
 3 files changed, 91 insertions(+), 17 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index cdf886efbbced0..807bd62e3c34f0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -467,6 +467,8 @@ void reshape_to_static(std::shared_ptr model,
         ov::PartialShape new_shape;
         if (input_name.find("input_ids") != std::string::npos) {
             new_shape = ov::PartialShape({1, input_size});
+        } else if (input_name.find("token_type_ids") != std::string::npos) {
+            new_shape = ov::PartialShape({1, input_size});
         } else if (input_name.find("inputs_embeds") != std::string::npos) {
             // NB: VLMs case, model accepts inputs_embeds[BATCH, SEQ_LEN, EMB_SIZE]
             NPUW_ASSERT(input.get_partial_shape().size() == 3u);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 8861c9d419a918..b142051af8c09f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -248,6 +248,14 @@ std::pair get_lora_dims_by_name(const std::string& state_nam
     return std::make_pair(low_rank_dim, full_rank_dim);
 }
 
+void copy_to_right(const ov::SoPtr& src, const ov::SoPtr& dst) {
+    OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size());
+    std::copy_n(
+        reinterpret_cast(src->data()),
+        src->get_byte_size(),
+        reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size());
+}
+
 constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1;
 
 constexpr std::size_t kStartOutputKVCacheLayers = 1;
@@ -472,6 +480,10 @@ void ov::npuw::LLMInferRequest::apply_lora() {
 
 void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
     fill_tensor_bytes(m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name)), 0u);
+    if (auto totyids_port = m_prefill_in_ports.find(layer_names::token_type_ids);
+        totyids_port != m_prefill_in_ports.end()) {
+        fill_tensor_bytes(m_prefill_request->get_tensor(totyids_port->second), 0u);
+    }
     fill_tensor(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)), 0);
     fill_tensor(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)), 0);
     m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
@@ -555,8 +567,8 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
 
 void ov::npuw::LLMInferRequest::update_kvcache_for(
     std::shared_ptr request,
-    std::unordered_map> in_ports,
-    std::unordered_map> out_ports,
+    const std::unordered_map>& in_ports,
+    const std::unordered_map>& out_ports,
     uint32_t num_tokens) {
     LOG_DEBUG("Store computed key and values for passed number of tokens in the input kv-cache"
               " layers.");
@@ -629,7 +641,8 @@ void ov::npuw::LLMInferRequest::clear_chunk_prefill_kv_cache() {
 
 void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr input_ids,
                                                       ov::SoPtr attention_mask,
-                                                      ov::SoPtr position_ids) {
+                                                      ov::SoPtr position_ids,
+                                                      ov::SoPtr token_types_ids) {
LOG_DEBUG("Calling chunked inference for prefill model."); LOG_BLOCK(); @@ -646,6 +659,15 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr inp auto attn_mask_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)); auto pos_ids_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)); + auto to_ty_ids_in_tensor = [&]() { + if (auto ttis_port = m_prefill_in_ports.find(layer_names::token_type_ids); + ttis_port != m_prefill_in_ports.end()) { + return m_prefill_request->get_tensor(ttis_port->second); + } + + return ov::npuw::util::TensorPtr(); + }(); + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; int64_t remaining_prompts = input_prompt_len; @@ -663,10 +685,18 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr inp // If the current prompt length is smaller than the chunk prompt length, // clear the last chunk of the attention mask to ensure non-relevant tokens are masked fill_tensor(attn_mask_in_tensor, 0, last_chunk_offset); + if (to_ty_ids_in_tensor) { + fill_tensor(to_ty_ids_in_tensor, 0, last_chunk_offset); + } } std::copy_n(attention_mask->data() + kvcache_desc.num_stored_tokens, current_prompts_len, attn_mask_in_tensor->data() + attn_mask_in_tensor->get_size() - current_prompts_len); + if (to_ty_ids_in_tensor) { + std::copy_n(token_types_ids->data() + kvcache_desc.num_stored_tokens, + current_prompts_len, + to_ty_ids_in_tensor->data() + to_ty_ids_in_tensor->get_size() - current_prompts_len); + } auto current_prefill_bytes = current_prompts_len * input_ids_elem_size; auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size; @@ -719,7 +749,8 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr inp void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr input_ids, ov::SoPtr attention_mask, - ov::SoPtr position_ids) { + ov::SoPtr position_ids, + ov::SoPtr token_types_ids) { LOG_DEBUG("Calling inference for prefill model in a single launch."); LOG_BLOCK(); @@ -736,6 +767,13 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr input attention_mask->get_size(), padded_attention_mask->data() + padded_attention_mask->get_size() - attention_mask->get_size()); + if (token_types_ids) { + auto padded_token_type_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids)); + + std::fill_n(reinterpret_cast(padded_token_type_ids->data()), token_types_ids->get_byte_size(), 0); + copy_to_right(token_types_ids, padded_token_type_ids); + } + auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)); pad_position_ids(padded_position_ids, position_ids); @@ -748,7 +786,8 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr input void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr input_ids, ov::SoPtr attention_mask, - ov::SoPtr position_ids) { + ov::SoPtr position_ids, + ov::SoPtr token_types_ids) { LOG_DEBUG("Calling inference for prefill model..."); LOG_BLOCK(); @@ -764,9 +803,9 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr input_ids, const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; if (use_chunk_prefill) { - infer_chunked_prefill(input_ids, attention_mask, position_ids); + infer_chunked_prefill(input_ids, attention_mask, position_ids, token_types_ids); } else { - infer_whole_prefill(input_ids, attention_mask, position_ids); + infer_whole_prefill(input_ids, attention_mask, position_ids, 
token_types_ids);
     }
 
     if (m_lm_head_request) {
@@ -784,7 +823,8 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr input_ids,
 
 void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids,
                                                ov::SoPtr attention_mask,
-                                               ov::SoPtr position_ids) {
+                                               ov::SoPtr position_ids,
+                                               ov::SoPtr token_types_ids) {
     LOG_DEBUG("Calling inference for generate model...");
     LOG_BLOCK();
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
@@ -823,6 +863,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids,
                     input_ids->get_byte_size(),
                     reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size());
 
+    if (token_types_ids) {
+        auto r_token_type_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids));
+        copy_to_right(token_types_ids, r_token_type_ids);
+    }
+
     // NOTE: Attention mask pattern for generate model requires the set of "1"
     // units of length of the current prompt on the right (for present
     // kv layers) and the set of "1" units of number of previously calculated
@@ -873,12 +918,30 @@ void ov::npuw::LLMInferRequest::infer() {
     // FIXME: position_ids might be optional for some models!
     auto position_ids = get_tensor(find_port_by_name(inputs, layer_names::position_ids).value());
 
+    auto token_types_ids = [&]() {
+        if (auto ttis_port = find_port_by_name(inputs, layer_names::token_type_ids); ttis_port.has_value()) {
+            return get_tensor(ttis_port.value());
+        }
+
+        return ov::npuw::util::TensorPtr();
+    }();
+
     // NB: For VLM, the "inputs_embeds" contains float values (embeddings)
     OPENVINO_ASSERT(ov::element::f32 == input_ids->get_element_type() ||
                     ov::element::i64 == input_ids->get_element_type());
     OPENVINO_ASSERT(ov::element::i64 == attention_mask->get_element_type());
     OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type());
 
+    if (m_first_run) {
+        // Most models have position_ids->data()[0] == 0 for the first infer,
+        // but for gemma3 it is 1.
+        // Store the original starting position id to distinguish the prefill and generate stages.
+        // In most cases prefill runs only once, but that is not true for chat mode,
+        // where prefill runs on every user input.
+        m_zero_position_id = position_ids->data()[0];
+        m_first_run = false;
+    }
+
     // NB: Check the sequence length provided for input_ids
     // and start position idx in order to distinguish prefill
     // and generate stages.
@@ -901,11 +964,11 @@ void ov::npuw::LLMInferRequest::infer() {
     // The outcome of two items is that prefill and generate stages
    // can be safely differentiated by start position id for
    // both main and draft models.
-    if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) {
-        infer_prefill(input_ids, attention_mask, position_ids);
+    if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == m_zero_position_id) {
+        infer_prefill(input_ids, attention_mask, position_ids, token_types_ids);
     } else {
         trim_kvcache_for_speculative_decoding(position_ids);
-        infer_generate(input_ids, attention_mask, position_ids);
+        infer_generate(input_ids, attention_mask, position_ids, token_types_ids);
     }
 }
 
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
index 86ccfef6e41700..9741c4432d4043 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -24,6 +24,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
         static constexpr const char* past_key_values = "past_key_values";
         static constexpr const char* output_embeds = "npuw_output_embed";
         static constexpr const char* logits = "logits";
+        static constexpr const char* token_type_ids = "token_type_ids";
     };
 
     explicit LLMInferRequest(const std::shared_ptr& compiled_model);
@@ -49,26 +50,30 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
     void init_tensor(const ov::Output& port);
     void copy_kvcache();
     void update_kvcache_for(std::shared_ptr request,
-                            std::unordered_map> in_ports,
-                            std::unordered_map> out_ports,
+                            const std::unordered_map>& in_ports,
+                            const std::unordered_map>& out_ports,
                             uint32_t tokens);
     void trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids);
 
     void infer_chunked_prefill(ov::SoPtr input_ids,
                                ov::SoPtr attention_mask,
-                               ov::SoPtr position_ids);
+                               ov::SoPtr position_ids,
+                               ov::SoPtr token_types_ids);
 
     void infer_whole_prefill(ov::SoPtr input_ids,
                              ov::SoPtr attention_mask,
-                             ov::SoPtr position_ids);
+                             ov::SoPtr position_ids,
+                             ov::SoPtr token_types_ids);
 
     void infer_prefill(ov::SoPtr input_ids,
                        ov::SoPtr attention_mask,
-                       ov::SoPtr position_ids);
+                       ov::SoPtr position_ids,
+                       ov::SoPtr token_types_ids);
 
     void infer_generate(ov::SoPtr input_ids,
                         ov::SoPtr attention_mask,
-                        ov::SoPtr position_ids);
+                        ov::SoPtr position_ids,
+                        ov::SoPtr token_types_ids);
 
     std::shared_ptr m_kvcache_request;
     std::shared_ptr m_prefill_request;
@@ -88,6 +93,10 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
 
     bool m_generate_initialized = false;
 
+    bool m_first_run = true;
+
+    int64_t m_zero_position_id = 0;
+
     // Support LoRA
     std::vector> m_variableStates;
     void init_lora_states();

From 0233897195d8a3bb777baff5cbac7ee64d58bb56 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Wed, 17 Sep 2025 12:34:05 +0000
Subject: [PATCH 2/3] clang-format

---
 .../intel_npu/src/plugin/npuw/llm_infer_request.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index b142051af8c09f..dd104a5bb00cc9 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -250,10 +250,9 @@ std::pair get_lora_dims_by_name(const std::string& state_nam
 
 void copy_to_right(const ov::SoPtr& src, const ov::SoPtr& dst) {
     OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size());
-    std::copy_n(
-        reinterpret_cast(src->data()),
-        src->get_byte_size(),
-        reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size());
+ std::copy_n(reinterpret_cast(src->data()), + src->get_byte_size(), + reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size()); } constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1; From f51d648f7aacca3eef69d9ee047cb5f1c7b0177a Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 17 Sep 2025 13:00:28 +0000 Subject: [PATCH 3/3] fix --- .../src/plugin/npuw/llm_infer_request.cpp | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index dd104a5bb00cc9..94a88fe882b958 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -658,14 +658,11 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr inp auto attn_mask_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)); auto pos_ids_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)); - auto to_ty_ids_in_tensor = [&]() { - if (auto ttis_port = m_prefill_in_ports.find(layer_names::token_type_ids); - ttis_port != m_prefill_in_ports.end()) { - return m_prefill_request->get_tensor(ttis_port->second); - } + auto to_ty_ids_in_tensor = ov::npuw::util::TensorPtr(); - return ov::npuw::util::TensorPtr(); - }(); + if (auto ttis_port = m_prefill_in_ports.find(layer_names::token_type_ids); ttis_port != m_prefill_in_ports.end()) { + to_ty_ids_in_tensor = m_prefill_request->get_tensor(ttis_port->second); + } auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; @@ -917,13 +914,11 @@ void ov::npuw::LLMInferRequest::infer() { // FIXME: position_ids might be optional for some models! auto position_ids = get_tensor(find_port_by_name(inputs, layer_names::position_ids).value()); - auto token_types_ids = [&]() { - if (auto ttis_port = find_port_by_name(inputs, layer_names::token_type_ids); ttis_port.has_value()) { - return get_tensor(ttis_port.value()); - } + auto token_types_ids = ov::npuw::util::TensorPtr(); - return ov::npuw::util::TensorPtr(); - }(); + if (auto ttis_port = find_port_by_name(inputs, layer_names::token_type_ids); ttis_port.has_value()) { + token_types_ids = get_tensor(ttis_port.value()); + } // NB: For VLM, the "inputs_embeds" contains float values (embeddings) OPENVINO_ASSERT(ov::element::f32 == input_ids->get_element_type() ||