diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp
index 7778ebb12450f0..d3307bb12fbebd 100644
--- a/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp
+++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp
@@ -160,6 +160,7 @@ void jit_rms_kernel::generate() {
     reduce_vmm_to_scalar(vmm_rsqrt, vmm_sum0, vmm_sum1, vmm_sum3, vec_size);
 
     // mean(x^2)
+    OPENVINO_ASSERT(m_jcp.data_size != 0);
     mov(reg_tmp.cvt32(), float2int(1.0f / m_jcp.data_size));
     vmovd(xmm_tmp, reg_tmp.cvt32());
     vmulss(xmm_rsqrt, xmm_rsqrt, xmm_tmp);
diff --git a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp
index b475a602c3cd1a..91a72fdbeb4cab 100644
--- a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp
+++ b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp
@@ -37,8 +37,6 @@ class LinearKsplit2 {
 
     LinearKsplit2() {}
 
-    ReduceAdd2bh* p_jit_reduce2bh;
-
     // weight [N, K]
     // Gate & Up are interleaved in N dimension: 16-gate / 16-up
     // and post-ops will compute silu(gate)*up in unit of 16 elements
@@ -201,7 +199,7 @@ class LinearGateUp {
        bool quantized_int8 = config.gate_up_quantized;
 
        auto reg_blk_K_size = quantized_int8 ? REG_BLK_K_SIZE_I8 : REG_BLK_K_SIZE;
-        auto cache_blk_k_size = quantized_int8 ? CACHE_BLK_K_SIZE : CACHE_BLK_K_SIZE;
+        auto cache_blk_k_size = CACHE_BLK_K_SIZE;
        auto weight_element_size = quantized_int8 ? sizeof(int8_t) : sizeof(ov::float16);
 
        // prepare weights, split N among threads
diff --git a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp
index ce7bfae07591d6..a7f0286c6b667c 100644
--- a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp
+++ b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp
@@ -75,7 +75,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase {
        // and activations will be dynamically per-token quantized and using AMX-INT8 to get the result
        bool quantized_int8 = m_node->m_config.quantized;
 
-        auto cache_blk_k_size = quantized_int8 ? CACHE_BLK_K_SIZE : CACHE_BLK_K_SIZE;
+        auto cache_blk_k_size = CACHE_BLK_K_SIZE;
        auto weight_element_size = quantized_int8 ? sizeof(int8_t) : sizeof(ov::float16);
 
        auto K = w0.size(1);
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
index 7fe3fc8dc5045d..71d39a1fce5ba8 100644
--- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
@@ -109,7 +109,7 @@ struct MHAKernel {
    }
 
    PlainTensor causal_mask;
-    bool select_nfltmax_at_0; // set attn_score to -FLT_MAX when causal_mask[...] equal to this
+    bool select_nfltmax_at_0 = false; // set attn_score to -FLT_MAX when causal_mask[...] equal to this
    void set_causal_mask(PlainTensor mask, bool _select_nfltmax_at_0) {
        causal_mask = mask;
        select_nfltmax_at_0 = _select_nfltmax_at_0;
@@ -526,7 +526,7 @@ struct MHAKernel {
    }
 
    PlainTensor causal_mask;
-    bool select_nfltmax_at_0; // set attn_score to -FLT_MAX when causal_mask[...] equal to this
+    bool select_nfltmax_at_0 = false; // set attn_score to -FLT_MAX when causal_mask[...] equal to this
    void set_causal_mask(PlainTensor mask, bool _select_nfltmax_at_0) {
        causal_mask = mask;
        select_nfltmax_at_0 = _select_nfltmax_at_0;
@@ -674,7 +674,7 @@ struct MHAKernel {
    }
 
    PlainTensor causal_mask;
-    bool select_nfltmax_at_0; // set attn_score to -FLT_MAX when causal_mask[...] equal to this
+    bool select_nfltmax_at_0 = false; // set attn_score to -FLT_MAX when causal_mask[...] equal to this
    void set_causal_mask(PlainTensor mask, bool _select_nfltmax_at_0) {
        causal_mask = mask;
        select_nfltmax_at_0 = _select_nfltmax_at_0;