diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention.h b/src/turbomind/kernels/decoder_masked_multihead_attention.h
index 50fa131d6..b44332090 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention.h
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention.h
@@ -121,7 +121,7 @@ struct Multihead_attention_params: public Multihead_attention_params_base {
     int max_position_embeddings = 0;
     bool use_dynamic_ntk = false;
     bool use_logn_attn = false;
-    float rotary_emb_base = 10000.0f;
+    float rotary_embedding_base = 10000.0f;
 };
 
 template
diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
index ad8598baf..40e1dd91d 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
@@ -1378,19 +1378,19 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params
         q = add(q, q_bias);
         k = add(k, k_bias);
 
-        float rotary_emb_base = params.rotary_emb_base;
+        float rotary_embedding_base = params.rotary_embedding_base;
         if (params.use_dynamic_ntk) {
             // +1 because of `length_per_sample == context_length - 1`
-            rotary_emb_base = rotary_embedding_get_base(params.length_per_sample[bi] + 1,
-                                                        params.max_position_embeddings,
-                                                        params.rotary_embedding_dim,
-                                                        rotary_emb_base);
+            rotary_embedding_base = rotary_embedding_get_base(params.length_per_sample[bi] + 1,
+                                                              params.max_position_embeddings,
+                                                              params.rotary_embedding_dim,
+                                                              rotary_embedding_base);
         }
 
         // Padded len
         const int padd_len = (params.total_padding_tokens == nullptr) ? 0 : params.total_padding_tokens[bi];
         if (params.rotary_embedding_dim > 0) {
-            apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, rotary_emb_base, params.timestep - padd_len);
+            apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, rotary_embedding_base, params.timestep - padd_len);
         }
 
         if (params.use_logn_attn) {
diff --git a/src/turbomind/kernels/unfused_attention_kernels.cu b/src/turbomind/kernels/unfused_attention_kernels.cu
index 8f28313e0..324fbe8d5 100644
--- a/src/turbomind/kernels/unfused_attention_kernels.cu
+++ b/src/turbomind/kernels/unfused_attention_kernels.cu
@@ -863,7 +863,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
                                                    int kv_head_num,
                                                    int size_per_head,
                                                    int rotary_embedding_dim,
-                                                   float rotary_emb_base,
+                                                   float rotary_embedding_base,
                                                    int max_position_embeddings,
                                                    bool use_dynamic_ntk,
                                                    bool use_logn_attn)
@@ -933,12 +933,12 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
     const int timestep = history_len + seq_idx;
 
     if (use_dynamic_ntk) {
-        rotary_emb_base = mmha::rotary_embedding_get_base(
-            context_len, max_position_embeddings, rotary_embedding_dim, rotary_emb_base);
+        rotary_embedding_base = mmha::rotary_embedding_get_base(
+            context_len, max_position_embeddings, rotary_embedding_dim, rotary_embedding_base);
     }
 
     // TODO: unused computation on k if GQA is used
-    mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_emb_base, timestep);
+    mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_embedding_base, timestep);
 
     if (use_logn_attn) {
         // +1 to convert to context length at the timestep
@@ -990,7 +990,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
         kv_head_num,                \
         size_per_head,              \
         rotary_embedding_dim,       \
-        rotary_emb_base,            \
+        rotary_embedding_base,      \
         max_position_embeddings,    \
         use_dynamic_ntk,            \
         use_logn_attn);
@@ -1011,7 +1011,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                     const int kv_head_num,
                                     const int size_per_head,
                                     const int rotary_embedding_dim,
-                                    float rotary_emb_base,
+                                    float rotary_embedding_base,
                                     int max_position_embeddings,
                                     bool use_dynamic_ntk,
                                     bool use_logn_attn,
@@ -1041,7 +1041,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                              const int kv_head_num,            \
                                              const int size_per_head,          \
                                              const int rotary_embedding_dim,   \
-                                             float rotary_emb_base,            \
+                                             float rotary_embedding_base,      \
                                              int max_position_embeddings,      \
                                              bool use_dynamic_ntk,             \
                                              bool use_logn_attn,               \
diff --git a/src/turbomind/kernels/unfused_attention_kernels.h b/src/turbomind/kernels/unfused_attention_kernels.h
index 22f939a5e..b5c37b5d4 100644
--- a/src/turbomind/kernels/unfused_attention_kernels.h
+++ b/src/turbomind/kernels/unfused_attention_kernels.h
@@ -79,7 +79,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                     const int kv_head_num,
                                     const int size_per_head,
                                     const int rotary_embedding_dim,
-                                    float rotary_emb_base,
+                                    float rotary_embedding_base,
                                     int max_position_embeddings,
                                     bool use_dynamic_ntk,
                                     bool use_logn_attn,
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index ad4136e69..e8f77e1c7 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -175,7 +175,7 @@ inline void LlamaContextAttentionLayer::forward(TensorMap*
                                     local_kv_head_num_,
                                     size_per_head_,
                                     params_.rotray_embedding_dim,
-                                    params_.rotary_emb_base,
+                                    params_.rotary_embedding_base,
                                     params_.max_position_embeddings,
                                     params_.use_dynamic_ntk,
                                     params_.use_logn_attn,
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index e39572762..3caaf5906 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -61,7 +61,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
                                                       const int kv_head_num,
                                                       const int size_per_head,
                                                       const int rotary_embedding_dim,
-                                                      const float rotary_emb_base,
+                                                      const float rotary_embedding_base,
                                                       const int max_position_embeddings,
                                                       const bool use_dynamic_ntk,
                                                       const bool use_logn_attn,
@@ -130,7 +130,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
     params.hidden_size_per_head = size_per_head;
     params.rotary_embedding_dim = rotary_embedding_dim;
-    params.rotary_emb_base = rotary_emb_base;
+    params.rotary_embedding_base = rotary_embedding_base;
     params.max_position_embeddings = max_position_embeddings;
     params.use_dynamic_ntk = use_dynamic_ntk;
     params.use_logn_attn = use_logn_attn;
@@ -263,7 +263,7 @@ void LlamaDecoderSelfAttentionLayer::forward(TensorMap* o
                                        local_kv_head_num_,
                                        size_per_head_,
                                        params_.rotray_embedding_dim,
-                                       params_.rotary_emb_base,
+                                       params_.rotary_embedding_base,
                                        params_.max_position_embeddings,
                                        params_.use_dynamic_ntk,
                                        params_.use_logn_attn,
diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h
index 446fe64d2..8f8c96837 100644
--- a/src/turbomind/models/llama/llama_params.h
+++ b/src/turbomind/models/llama/llama_params.h
@@ -6,7 +6,7 @@ namespace turbomind {
 
 struct LlamaAttentionParams {
     int rotray_embedding_dim;
-    float rotary_emb_base;
+    float rotary_embedding_base;
     int max_position_embeddings;
     bool use_dynamic_ntk;
     bool use_logn_attn;
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index d7edf11d6..456f5f41c 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -137,7 +137,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size,
     group_size_ = reader.GetInteger("llama", "group_size", 0);
 
     attn_params_.rotray_embedding_dim    = reader.GetInteger("llama", "rotary_embedding");
-    attn_params_.rotary_emb_base         = reader.GetFloat("llama", "rope_theta", 10000.0f);
+    attn_params_.rotary_embedding_base   = reader.GetFloat("llama", "rope_theta", 10000.0f);
    attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0);
    attn_params_.use_dynamic_ntk         = reader.GetInteger("llama", "use_dynamic_ntk", 0);
    attn_params_.use_logn_attn           = reader.GetInteger("llama", "use_logn_attn", 0);
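
Note on the renamed field (not part of the patch): when use_dynamic_ntk is enabled, rotary_embedding_base (loaded from rope_theta in LlamaTritonModel.cc) is rescaled per sequence by mmha::rotary_embedding_get_base before apply_rotary_embedding runs. The C++ sketch below illustrates the widely used dynamic NTK-aware rescaling formula (the form popularized by Hugging Face's dynamic-NTK rotary embedding) for context only; the helper name dynamic_ntk_base and the exact formula are assumptions for illustration, not the turbomind implementation.

// Hypothetical sketch of dynamic NTK-aware base rescaling; the actual
// mmha::rotary_embedding_get_base in the diff above may use a different variant.
#include <cmath>
#include <cstdio>

static float dynamic_ntk_base(int seq_len,
                              int max_position_embeddings,
                              int rotary_embedding_dim,
                              float base,
                              float scaling_factor = 1.0f)
{
    if (seq_len <= max_position_embeddings) {
        return base;  // within the trained window, keep the configured rope_theta
    }
    // base' = base * ((s * L / L_max) - (s - 1)) ^ (d / (d - 2))
    // Enlarging the base stretches the low rotary frequencies so positions beyond
    // the trained window still map to distinct phases.
    const float ratio    = scaling_factor * seq_len / (float)max_position_embeddings - (scaling_factor - 1.0f);
    const float exponent = rotary_embedding_dim / (float)(rotary_embedding_dim - 2);
    return base * std::pow(ratio, exponent);
}

int main()
{
    // Example: rope_theta = 10000, head dim = 128, trained on 2048 positions, current length 4096.
    std::printf("rescaled base = %.1f\n", dynamic_ntk_base(4096, 2048, 128, 10000.0f));
    return 0;
}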