From de46dcf9730c39f1da313c52f896f5e505d1a970 Mon Sep 17 00:00:00 2001 From: llbdyiu66 Date: Wed, 10 Dec 2025 15:03:12 +0800 Subject: [PATCH 1/4] add qwen2/3 old fused qkv/ffn --- paddleformers/transformers/qwen2/modeling.py | 146 +++++++++--- .../transformers/qwen2_moe/modeling.py | 220 +++++++++++------- paddleformers/transformers/qwen3/modeling.py | 141 +++++++---- .../transformers/qwen3_moe/modeling.py | 213 ++++++++++------- 4 files changed, 473 insertions(+), 247 deletions(-) diff --git a/paddleformers/transformers/qwen2/modeling.py b/paddleformers/transformers/qwen2/modeling.py index 856cc9b9bc..efc0cf54cf 100644 --- a/paddleformers/transformers/qwen2/modeling.py +++ b/paddleformers/transformers/qwen2/modeling.py @@ -307,7 +307,6 @@ class Qwen2PretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): - """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -317,50 +316,122 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): num_attention_heads=config.num_attention_heads, ) - LAYER_COLWISE = [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - "mlp.up_proj.weight", - "mlp.gate_proj.weight", - ] - - LAYER_ROWWISE = ["self_attn.o_proj.weight", "mlp.down_proj.weight"] - - BIAS_KEYS = [ - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", - ] + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} - def make_base_actions(): - actions = { - "lm_head.weight": partial(fn, is_column=False), + base_actions = { + # Row Linear "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), } - for layer_idx in range(config.num_hidden_layers): - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) - for k in LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) - for k in LAYER_ROWWISE - } - ) - # bias - actions.update( - {f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) for b in BIAS_KEYS} + + if config.tie_word_embeddings: + base_actions["lm_head.weight"] = partial(fn, is_column=False) + else: + base_actions["lm_head.weight"] = partial(fn, is_column=True) + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. 
+ if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action - return actions + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) - mappings = make_base_actions() return mappings + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: Qwen2Config, is_fuse=False): + # return parameter fuse utils + from ..conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. + fuse_qkv_keys = [ + ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ), + ( + "layers.0.self_attn.q_proj.bias", + "layers.0.self_attn.k_proj.bias", + "layers.0.self_attn.v_proj.bias", + "layers.0.self_attn.qkv_proj.bias", + ), + ] + + fuse_gate_up_keys = ( + "layers.0.mlp.gate_proj.weight", + "layers.0.mlp.up_proj.weight", + "layers.0.mlp.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, + split_nums=3, + is_qkv=True, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + ) + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = partial(fn, split_nums=2) + return final_actions + @classmethod def _gen_aoa_config(cls, config: Qwen2Config): model_prefix = "" if cls == cls.base_model_class else "model." 
@@ -1025,6 +1096,7 @@ class Qwen2ForCausalLMPipe(GeneralModelForCausalLMPipe): config_class = Qwen2Config _decoder_layer_cls = Qwen2DecoderLayer _get_tensor_parallel_mappings = Qwen2Model._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = Qwen2Model._get_fuse_or_split_param_mappings _init_weights = Qwen2Model._init_weights _keep_in_fp32_modules = Qwen2Model._keep_in_fp32_modules _rotary_emb_cls = Qwen2RotaryEmbedding diff --git a/paddleformers/transformers/qwen2_moe/modeling.py b/paddleformers/transformers/qwen2_moe/modeling.py index 5ee6805a73..5e9e99ae14 100644 --- a/paddleformers/transformers/qwen2_moe/modeling.py +++ b/paddleformers/transformers/qwen2_moe/modeling.py @@ -516,7 +516,6 @@ class Qwen2MoePretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): - """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -526,95 +525,153 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): num_attention_heads=config.num_attention_heads, ) - LAYER_COLWISE = [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - ] + def get_tensor_parallel_split_mappings(num_layers, num_experts): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } - LAYER_ROWWISE = ["self_attn.o_proj.weight"] + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") - EXPERT_LAYER_COLWISE = [ - "up_proj.weight", - "gate_proj.weight", - ] + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + # Add tp split for expert params. + if config.fuse_attention_ffn: + base_actions = { + "layers.0.mlp.experts.0.gate_up_fused_proj.weight": partial( + fn, is_column=True, is_naive_2fuse=True + ), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + } + else: + # Add tp split for expert params. 
+ base_actions = { + "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + for i in range(num_layers): + newkey = key.replace("layers.0.", f"layers.{i}.") + for j in range(num_experts): + newkey2 = newkey.replace("experts.0.", f"experts.{j}.") + final_actions[newkey2] = action + + # Add tp split for shared expert params. + if config.fuse_attention_ffn: + base_actions = { + "layers.0.mlp.shared_expert.gate_up_fused_proj.weight": partial( + fn, is_column=True, is_naive_2fuse=True + ), + "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), + } + else: + base_actions = { + "layers.0.mlp.shared_expert.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.up_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action - EXPERT_LAYER_ROWWISE = ["down_proj.weight"] + return final_actions - SHARED_EXPERT_LAYER_COLWISE = [ - "up_proj.weight", - "gate_proj.weight", - ] + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) - SHARED_EXPERT_LAYER_ROWWISE = ["down_proj.weight"] + return mappings - BIAS_KEYS = [ - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: Qwen2MoeConfig, is_fuse=False): + # return parameter fuse utils + from ..conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
+ fuse_qkv_keys = [ + ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ), + ( + "layers.0.self_attn.q_proj.bias", + "layers.0.self_attn.k_proj.bias", + "layers.0.self_attn.v_proj.bias", + "layers.0.self_attn.qkv_proj.bias", + ), ] - def make_base_actions(): - actions = { - "lm_head.weight": partial(fn, is_column=False), - "embed_tokens.weight": partial(fn, is_column=False), - } - for layer_idx in range(config.num_hidden_layers): - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) - for k in LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) - for k in LAYER_ROWWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(fn, is_column=True) - for e in range(config.num_experts) - for k in EXPERT_LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(fn, is_column=False) - for e in range(config.num_experts) - for k in EXPERT_LAYER_ROWWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( - fn, is_column=True - ) - for k in SHARED_EXPERT_LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( - fn, is_column=False + fuse_gate_up_keys = ( + "layers.0.mlp.experts.0.gate_proj.weight", + "layers.0.mlp.experts.0.up_proj.weight", + "layers.0.mlp.experts.0.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + num_experts = getattr(config, "num_experts", 128) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = [key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys] + for j in range(num_experts): + experts_keys = tuple([key.replace("experts.0.", f"experts.{j}.") for key in keys]) + final_actions[experts_keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, + split_nums=3, + is_qkv=True, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, ) - for k in SHARED_EXPERT_LAYER_ROWWISE - } - ) - # bias - if config.qkv_bias: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in BIAS_KEYS - } - ) - - return actions - - mappings = make_base_actions() - return mappings + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = [key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys] + for j in range(num_experts): + experts_keys = tuple([key.replace("experts.0.", f"experts.{j}.") for key in keys]) + final_actions[experts_keys] = partial(fn, split_nums=2) + return final_actions @classmethod def _gen_aoa_config(cls, config: Qwen2MoeConfig): @@ -1099,6 +1156,7 @@
class Qwen2MoeForCausalLMPipe(GeneralModelForCausalLMPipe): config_class = Qwen2MoeConfig _decoder_layer_cls = Qwen2MoeDecoderLayer _get_tensor_parallel_mappings = Qwen2MoeModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = Qwen2MoeModel._get_fuse_or_split_param_mappings _init_weights = Qwen2MoeModel._init_weights _keep_in_fp32_modules = Qwen2MoeModel._keep_in_fp32_modules _rotary_emb_cls = Qwen2MoeRotaryEmbedding diff --git a/paddleformers/transformers/qwen3/modeling.py b/paddleformers/transformers/qwen3/modeling.py index cefddb34c8..720774424b 100644 --- a/paddleformers/transformers/qwen3/modeling.py +++ b/paddleformers/transformers/qwen3/modeling.py @@ -323,7 +323,7 @@ class Qwen3PretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen3Config, is_split=True): - """Generate tensor parallel mappings for model conversion.""" + from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -333,54 +333,108 @@ def _get_tensor_parallel_mappings(cls, config: Qwen3Config, is_split=True): num_attention_heads=config.num_attention_heads, ) - LAYER_COLWISE = [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - "mlp.up_proj.weight", - "mlp.gate_proj.weight", - ] - - LAYER_ROWWISE = ["self_attn.o_proj.weight", "mlp.down_proj.weight"] + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} - BIAS_KEYS = [ - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", - ] - - def make_base_actions(): - actions = { - "lm_head.weight": partial(fn, is_column=False), + base_actions = { + # Row Linear "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), } - for layer_idx in range(config.num_hidden_layers): - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) - for k in LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) - for k in LAYER_ROWWISE - } + + if config.tie_word_embeddings: + base_actions["lm_head.weight"] = partial(fn, is_column=False) + else: + base_actions["lm_head.weight"] = partial(fn, is_column=True) + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. 
+ if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True ) - # bias - if config.attention_bias: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in BIAS_KEYS - } - ) - - return actions - - mappings = make_base_actions() + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + return mappings + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: Qwen3Config, is_fuse=False): + # return parameter fuse utils + from ..conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. + fuse_qkv_keys = [ + ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ) + ] + + fuse_gate_up_keys = ( + "layers.0.mlp.gate_proj.weight", + "layers.0.mlp.up_proj.weight", + "layers.0.mlp.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = partial(fn, split_nums=2) + return final_actions + @classmethod def _gen_aoa_config(cls, config: Qwen3Config): model_prefix = "" if cls == cls.base_model_class else "model." 
@@ -1051,6 +1105,7 @@ class Qwen3ForCausalLMPipe(GeneralModelForCausalLMPipe): config_class = Qwen3Config _decoder_layer_cls = Qwen3DecoderLayer _get_tensor_parallel_mappings = Qwen3Model._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = Qwen3Model._get_fuse_or_split_param_mappings _init_weights = Qwen3Model._init_weights _keep_in_fp32_modules = Qwen3Model._keep_in_fp32_modules _rotary_emb_cls = Qwen3RotaryEmbedding diff --git a/paddleformers/transformers/qwen3_moe/modeling.py b/paddleformers/transformers/qwen3_moe/modeling.py index ca834848c7..aadd31590c 100644 --- a/paddleformers/transformers/qwen3_moe/modeling.py +++ b/paddleformers/transformers/qwen3_moe/modeling.py @@ -547,7 +547,6 @@ class Qwen3MoePretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen3MoeConfig, is_split=True): - """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -557,97 +556,138 @@ def _get_tensor_parallel_mappings(cls, config: Qwen3MoeConfig, is_split=True): num_attention_heads=config.num_attention_heads, ) - LAYER_COLWISE = [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - ] + def get_tensor_parallel_split_mappings(num_layers, num_experts): + final_actions = {} - LAYER_ROWWISE = ["self_attn.o_proj.weight"] + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } - EXPERT_LAYER_COLWISE = [ - "up_proj.weight", - "gate_proj.weight", - ] + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") - EXPERT_LAYER_ROWWISE = ["down_proj.weight"] + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + # Add tp split for expert params. + if config.fuse_attention_ffn: + base_actions = { + "layers.0.mlp.experts.0.gate_up_fused_proj.weight": partial( + fn, is_column=True, is_naive_2fuse=True + ), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + } + else: + # Add tp split for expert params. 
+ base_actions = { + "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + for i in range(num_layers): + newkey = key.replace("layers.0.", f"layers.{i}.") + for j in range(num_experts): + newkey2 = newkey.replace("experts.0.", f"experts.{j}.") + final_actions[newkey2] = action + + # Add tp split for shared expert params. + base_actions = {} + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) - BIAS_KEYS = [ - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", - ] + return mappings - def make_base_actions(): - actions = { - "lm_head.weight": partial(fn, is_column=False), - "embed_tokens.weight": partial(fn, is_column=False), - } - for layer_idx in range(config.num_hidden_layers): - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) - for k in LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) - for k in LAYER_ROWWISE - } - ) - try: - moe_group = fleet.get_hybrid_communicate_group().get_expert_parallel_group() - except Exception: - moe_group = None - expert_parallel_degree = dist.get_world_size(moe_group) if moe_group is not None else 1 - # TODO: merge disable_ffn_model_parallel and expert_parallel_degree - if expert_parallel_degree <= 1: - # # if disable_ffn_model_parallel is True, disable expert layer tp plan - # if not config.disable_ffn_model_parallel: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( - fn, is_column=True - ) - for e in range(config.num_experts) - for k in EXPERT_LAYER_COLWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( - fn, is_column=False - ) - for e in range(config.num_experts) - for k in EXPERT_LAYER_ROWWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.{k}": partial(fn, is_column=False) - for k in EXPERT_LAYER_ROWWISE - } - ) - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.{k}": partial(fn, is_column=True) - for k in EXPERT_LAYER_COLWISE - } - ) + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: Qwen3MoeConfig, is_fuse=False): + # return parameter fuse utils + from ..conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
+ fuse_qkv_keys = [ + ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ), + ] - # bias - if config.attention_bias: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in BIAS_KEYS - } - ) - return actions - - mappings = make_base_actions() - return mappings + fuse_gate_up_keys = ( + "layers.0.mlp.experts.0.gate_proj.weight", + "layers.0.mlp.experts.0.up_proj.weight", + "layers.0.mlp.experts.0.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + num_experts = getattr(config, "num_experts", 128) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = [key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys] + for j in range(num_experts): + experts_keys = tuple([key.replace("experts.0.", f"experts.{j}.") for key in keys]) + final_actions[experts_keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for fuse_keys in fuse_qkv_keys: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_keys]) + final_actions[keys] = partial( + fn, + split_nums=3, + is_qkv=True, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + ) + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = [key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys] + for j in range(num_experts): + experts_keys = tuple([key.replace("experts.0.", f"experts.{j}.") for key in keys]) + final_actions[experts_keys] = partial(fn, split_nums=2) + return final_actions @classmethod def _gen_aoa_config(cls, config: Qwen3MoeConfig): @@ -1120,6 +1160,7 @@ class Qwen3MoeForCausalLMPipe(GeneralModelForCausalLMPipe): config_class = Qwen3MoeConfig _decoder_layer_cls = Qwen3MoeDecoderLayer _get_tensor_parallel_mappings = Qwen3MoeModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = Qwen3MoeModel._get_fuse_or_split_param_mappings _init_weights = Qwen3MoeModel._init_weights _keep_in_fp32_modules = Qwen3MoeModel._keep_in_fp32_modules _rotary_emb_cls = Qwen3MoeRotaryEmbedding From fd4cb625d067362f89a1e1e01412056bc3b26c39 Mon Sep 17 00:00:00 2001 From: llbdyiu66 Date: Wed, 10 Dec 2025 16:56:32 +0800 Subject: [PATCH 2/4] change fuse key up_gate_proj --- paddleformers/transformers/qwen2/modeling.py | 6 ++---- paddleformers/transformers/qwen2_moe/modeling.py | 10 +++------- paddleformers/transformers/qwen3/modeling.py | 6 ++---- paddleformers/transformers/qwen3_moe/modeling.py | 6 ++---- 4 files changed, 9 insertions(+), 19 deletions(-) diff --git a/paddleformers/transformers/qwen2/modeling.py b/paddleformers/transformers/qwen2/modeling.py index efc0cf54cf..90fd5bac8e 100644 --- a/paddleformers/transformers/qwen2/modeling.py +++ b/paddleformers/transformers/qwen2/modeling.py @@ -349,9 +349,7 @@ def get_tensor_parallel_split_mappings(num_layers): 
base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) if config.fuse_attention_ffn: - base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( - fn, is_column=True, is_naive_2fuse=True - ) + base_actions["layers.0.mlp.up_gate_proj.weight"] = partial(fn, is_column=True, is_naive_2fuse=True) else: base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) @@ -394,7 +392,7 @@ def _get_fuse_or_split_param_mappings(cls, config: Qwen2Config, is_fuse=False): fuse_gate_up_keys = ( "layers.0.mlp.gate_proj.weight", "layers.0.mlp.up_proj.weight", - "layers.0.mlp.gate_up_fused_proj.weight", + "layers.0.mlp.up_gate_proj.weight", ) num_heads = config.num_attention_heads num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) diff --git a/paddleformers/transformers/qwen2_moe/modeling.py b/paddleformers/transformers/qwen2_moe/modeling.py index 5e9e99ae14..7713186452 100644 --- a/paddleformers/transformers/qwen2_moe/modeling.py +++ b/paddleformers/transformers/qwen2_moe/modeling.py @@ -562,9 +562,7 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): # Add tp split for expert params. if config.fuse_attention_ffn: base_actions = { - "layers.0.mlp.experts.0.gate_up_fused_proj.weight": partial( - fn, is_column=True, is_naive_2fuse=True - ), + "layers.0.mlp.experts.0.up_gate_proj.weight": partial(fn, is_column=True, is_naive_2fuse=True), "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), } else: @@ -584,9 +582,7 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): # Add tp split for shared expert params. if config.fuse_attention_ffn: base_actions = { - "layers.0.mlp.shared_expert.gate_up_fused_proj.weight": partial( - fn, is_column=True, is_naive_2fuse=True - ), + "layers.0.mlp.shared_expert.up_gate_proj.weight": partial(fn, is_column=True, is_naive_2fuse=True), "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), } else: @@ -633,7 +629,7 @@ def _get_fuse_or_split_param_mappings(cls, config: Qwen2MoeConfig, is_fuse=False fuse_gate_up_keys = ( "layers.0.mlp.experts.0.gate_proj.weight", "layers.0.mlp.experts.0.up_proj.weight", - "layers.0.mlp.experts.0.gate_up_fused_proj.weight", + "layers.0.mlp.experts.0.up_gate_proj.weight", ) num_heads = config.num_attention_heads num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) diff --git a/paddleformers/transformers/qwen3/modeling.py b/paddleformers/transformers/qwen3/modeling.py index 720774424b..8df090af6a 100644 --- a/paddleformers/transformers/qwen3/modeling.py +++ b/paddleformers/transformers/qwen3/modeling.py @@ -362,9 +362,7 @@ def get_tensor_parallel_split_mappings(num_layers): base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) if config.fuse_attention_ffn: - base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( - fn, is_column=True, is_naive_2fuse=True - ) + base_actions["layers.0.mlp.up_gate_proj.weight"] = partial(fn, is_column=True, is_naive_2fuse=True) else: base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) @@ -401,7 +399,7 @@ def _get_fuse_or_split_param_mappings(cls, config: Qwen3Config, is_fuse=False): fuse_gate_up_keys = ( "layers.0.mlp.gate_proj.weight", "layers.0.mlp.up_proj.weight", - "layers.0.mlp.gate_up_fused_proj.weight", + "layers.0.mlp.up_gate_proj.weight", ) num_heads = 
config.num_attention_heads num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) diff --git a/paddleformers/transformers/qwen3_moe/modeling.py b/paddleformers/transformers/qwen3_moe/modeling.py index aadd31590c..6edb6fd94f 100644 --- a/paddleformers/transformers/qwen3_moe/modeling.py +++ b/paddleformers/transformers/qwen3_moe/modeling.py @@ -593,9 +593,7 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): # Add tp split for expert params. if config.fuse_attention_ffn: base_actions = { - "layers.0.mlp.experts.0.gate_up_fused_proj.weight": partial( - fn, is_column=True, is_naive_2fuse=True - ), + "layers.0.mlp.experts.0.up_gate_proj.weight": partial(fn, is_column=True, is_naive_2fuse=True), "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), } else: @@ -646,7 +644,7 @@ def _get_fuse_or_split_param_mappings(cls, config: Qwen3MoeConfig, is_fuse=False fuse_gate_up_keys = ( "layers.0.mlp.experts.0.gate_proj.weight", "layers.0.mlp.experts.0.up_proj.weight", - "layers.0.mlp.experts.0.gate_up_fused_proj.weight", + "layers.0.mlp.experts.0.up_gate_proj.weight", ) num_heads = config.num_attention_heads num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) From 147c322a0653cf5a7105ee0c24e2bafd5b1e76a0 Mon Sep 17 00:00:00 2001 From: llbdyiu66 Date: Wed, 10 Dec 2025 19:54:46 +0800 Subject: [PATCH 3/4] fix --- paddleformers/transformers/qwen2/modeling.py | 105 +++++---- .../transformers/qwen2_moe/modeling.py | 203 ++++++++++++------ paddleformers/transformers/qwen3/modeling.py | 107 +++++---- .../transformers/qwen3_moe/modeling.py | 171 ++++++++++----- 4 files changed, 375 insertions(+), 211 deletions(-) diff --git a/paddleformers/transformers/qwen2/modeling.py b/paddleformers/transformers/qwen2/modeling.py index 90fd5bac8e..1bc41b675f 100644 --- a/paddleformers/transformers/qwen2/modeling.py +++ b/paddleformers/transformers/qwen2/modeling.py @@ -307,6 +307,7 @@ class Qwen2PretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): + """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -316,54 +317,74 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers): - final_actions = {} + LAYER_COLWISE = [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "mlp.up_proj.weight", + "mlp.gate_proj.weight", + ] - base_actions = { - # Row Linear - "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), - "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), - } + FUSE_LAYER_COLWISE = [ + "self_attn.qkv_proj.weight", + "mlp.up_gate_proj.weight", + ] - if config.tie_word_embeddings: - base_actions["lm_head.weight"] = partial(fn, is_column=False) - else: - base_actions["lm_head.weight"] = partial(fn, is_column=True) - - if not config.vocab_size % config.tensor_parallel_degree == 0: - base_actions.pop("lm_head.weight") - base_actions.pop("embed_tokens.weight") - # Column Linear - if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) - else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) - 
base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) - # if we have enough num_key_value_heads to split, then split it. - if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) - - if config.fuse_attention_ffn: - base_actions["layers.0.mlp.up_gate_proj.weight"] = partial(fn, is_column=True, is_naive_2fuse=True) - else: - base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + LAYER_ROWWISE = ["self_attn.o_proj.weight", "mlp.down_proj.weight"] - for key, action in base_actions.items(): - if "layers.0." in key: - for i in range(num_layers): - final_actions[key.replace("layers.0.", f"layers.{i}.")] = action - final_actions[key] = action + BIAS_KEYS = [ + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", + ] + FUSE_BIAS_KEYS = [ + "self_attn.qkv_proj.bias", + ] - return final_actions + def make_base_actions(): + actions = { + "lm_head.weight": partial(fn, is_column=False), + f"{cls.base_model_prefix}.embed_tokens.weight": partial(fn, is_column=False), + } + for layer_idx in range(config.num_hidden_layers): + # colwise + if not config.fuse_attention_qkv: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in LAYER_COLWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in BIAS_KEYS + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FUSE_LAYER_COLWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in FUSE_BIAS_KEYS + } + ) + # rowwise + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) + for k in LAYER_ROWWISE + } + ) - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + return actions + mappings = make_base_actions() return mappings @classmethod diff --git a/paddleformers/transformers/qwen2_moe/modeling.py b/paddleformers/transformers/qwen2_moe/modeling.py index 7713186452..6e71442108 100644 --- a/paddleformers/transformers/qwen2_moe/modeling.py +++ b/paddleformers/transformers/qwen2_moe/modeling.py @@ -516,6 +516,7 @@ class Qwen2MoePretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): + """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -525,82 +526,144 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers, num_experts): - final_actions = {} + LAYER_COLWISE = [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ] + FUSE_LAYER_COLWISE = [ + "self_attn.qkv_proj.weight", + ] - base_actions = { - "lm_head.weight": partial(fn, is_column=True), - # Row Linear - "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": 
partial(fn, is_column=False), - } + LAYER_ROWWISE = ["self_attn.o_proj.weight"] - if not config.vocab_size % config.tensor_parallel_degree == 0: - base_actions.pop("lm_head.weight") - base_actions.pop("embed_tokens.weight") + EXPERT_LAYER_COLWISE = [ + "up_proj.weight", + "gate_proj.weight", + ] + FUSE_EXPERT_LAYER_COLWISE = [ + "up_gate_proj.weight", + ] - # Column Linear - if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) - else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) - # if we have enough num_key_value_heads to split, then split it. - if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) - - for key, action in base_actions.items(): - if "layers.0." in key: - for i in range(num_layers): - final_actions[key.replace("layers.0.", f"layers.{i}.")] = action - final_actions[key] = action - - # Add tp split for expert params. - if config.fuse_attention_ffn: - base_actions = { - "layers.0.mlp.experts.0.up_gate_proj.weight": partial(fn, is_column=True, is_naive_2fuse=True), - "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), - } - else: - # Add tp split for expert params. - base_actions = { - "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), - } - for key, action in base_actions.items(): - for i in range(num_layers): - newkey = key.replace("layers.0.", f"layers.{i}.") - for j in range(num_experts): - newkey2 = newkey.replace("experts.0.", f"experts.{j}.") - final_actions[newkey2] = action - - # Add tp split for shared expert params. - if config.fuse_attention_ffn: - base_actions = { - "layers.0.mlp.shared_expert.up_gate_proj.weight": partial(fn, is_column=True, is_naive_2fuse=True), - "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), - } - else: - base_actions = { - "layers.0.mlp.shared_expert.gate_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.shared_expert.up_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), - } - for key, action in base_actions.items(): - if "layers.0." 
in key: - for i in range(num_layers): - final_actions[key.replace("layers.0.", f"layers.{i}.")] = action - final_actions[key] = action + EXPERT_LAYER_ROWWISE = ["down_proj.weight"] + + SHARED_EXPERT_LAYER_COLWISE = [ + "up_proj.weight", + "gate_proj.weight", + ] + FUSE_SHARED_EXPERT_LAYER_COLWISE = [ + "up_gate_proj.weight", + ] + + SHARED_EXPERT_LAYER_ROWWISE = ["down_proj.weight"] - return final_actions + BIAS_KEYS = [ + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", + ] + FUSE_BIAS_KEYS = [ + "self_attn.qkv_proj.bias", + ] + + def make_base_actions(): + actions = { + "lm_head.weight": partial(fn, is_column=False), + f"{cls.base_model_prefix}.embed_tokens.weight": partial(fn, is_column=False), + } + for layer_idx in range(config.num_hidden_layers): + # colwise + if not config.fuse_attention_qkv: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in LAYER_COLWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( + fn, is_column=True + ) + for e in range(config.num_experts) + for k in EXPERT_LAYER_COLWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( + fn, is_column=True + ) + for k in SHARED_EXPERT_LAYER_COLWISE + } + ) + # bias + if config.qkv_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in BIAS_KEYS + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FUSE_LAYER_COLWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( + fn, is_column=True + ) + for e in range(config.num_experts) + for k in FUSE_EXPERT_LAYER_COLWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( + fn, is_column=True + ) + for k in FUSE_SHARED_EXPERT_LAYER_COLWISE + } + ) + # bias + if config.qkv_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in FUSE_BIAS_KEYS + } + ) + + # rowwise + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) + for k in LAYER_ROWWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(fn, is_column=False) + for e in range(config.num_experts) + for k in EXPERT_LAYER_ROWWISE + } + ) + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( + fn, is_column=False + ) + for k in SHARED_EXPERT_LAYER_ROWWISE + } + ) - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) + return actions + mappings = make_base_actions() return mappings @classmethod diff --git a/paddleformers/transformers/qwen3/modeling.py b/paddleformers/transformers/qwen3/modeling.py index 8df090af6a..f7dcae2691 100644 --- a/paddleformers/transformers/qwen3/modeling.py +++ b/paddleformers/transformers/qwen3/modeling.py @@ -323,7 +323,7 @@ class Qwen3PretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen3Config, is_split=True): - + """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -333,50 +333,77 @@ def _get_tensor_parallel_mappings(cls, config: Qwen3Config, is_split=True): 
num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers): - final_actions = {} - - base_actions = { - # Row Linear - "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), - "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), - } + LAYER_COLWISE = [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "mlp.up_proj.weight", + "mlp.gate_proj.weight", + ] + FUSE_LAYER_COLWISE = [ + "self_attn.qkv_proj.weight", + "mlp.up_gate_proj.weight", + ] - if config.tie_word_embeddings: - base_actions["lm_head.weight"] = partial(fn, is_column=False) - else: - base_actions["lm_head.weight"] = partial(fn, is_column=True) - - if not config.vocab_size % config.tensor_parallel_degree == 0: - base_actions.pop("lm_head.weight") - base_actions.pop("embed_tokens.weight") - # Column Linear - if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) - else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) - # if we have enough num_key_value_heads to split, then split it. - if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) - - if config.fuse_attention_ffn: - base_actions["layers.0.mlp.up_gate_proj.weight"] = partial(fn, is_column=True, is_naive_2fuse=True) - else: - base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + LAYER_ROWWISE = ["self_attn.o_proj.weight", "mlp.down_proj.weight"] - for key, action in base_actions.items(): - if "layers.0." 
in key: - for i in range(num_layers): - final_actions[key.replace("layers.0.", f"layers.{i}.")] = action - final_actions[key] = action + BIAS_KEYS = [ + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", + ] + FUSE_BIAS_KEYS = [ + "self_attn.qkv_proj.bias", + ] - return final_actions + def make_base_actions(): + actions = { + "lm_head.weight": partial(fn, is_column=False), + f"{cls.base_model_prefix}.embed_tokens.weight": partial(fn, is_column=False), + } + for layer_idx in range(config.num_hidden_layers): + # colwise + if not config.fuse_attention_qkv: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in LAYER_COLWISE + } + ) + # bias + if config.attention_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in BIAS_KEYS + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FUSE_LAYER_COLWISE + } + ) + # bias + if config.attention_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in FUSE_BIAS_KEYS + } + ) + # rowwise + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) + for k in LAYER_ROWWISE + } + ) - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + return actions + mappings = make_base_actions() return mappings @classmethod diff --git a/paddleformers/transformers/qwen3_moe/modeling.py b/paddleformers/transformers/qwen3_moe/modeling.py index 6edb6fd94f..89b04c2c77 100644 --- a/paddleformers/transformers/qwen3_moe/modeling.py +++ b/paddleformers/transformers/qwen3_moe/modeling.py @@ -547,6 +547,7 @@ class Qwen3MoePretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: Qwen3MoeConfig, is_split=True): + """Generate tensor parallel mappings for model conversion.""" from ..conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -556,72 +557,124 @@ def _get_tensor_parallel_mappings(cls, config: Qwen3MoeConfig, is_split=True): num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers, num_experts): - final_actions = {} + LAYER_COLWISE = [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ] + FUSE_LAYER_COLWISE = [ + "self_attn.qkv_proj.weight", + ] - base_actions = { - "lm_head.weight": partial(fn, is_column=True), - # Row Linear - "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), - } + LAYER_ROWWISE = ["self_attn.o_proj.weight"] - if not config.vocab_size % config.tensor_parallel_degree == 0: - base_actions.pop("lm_head.weight") - base_actions.pop("embed_tokens.weight") + EXPERT_LAYER_COLWISE = [ + "up_proj.weight", + "gate_proj.weight", + ] + FUSE_EXPERT_LAYER_COLWISE = [ + "up_gate_proj.weight", + ] - # Column Linear - if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) - else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) - # if we have enough num_key_value_heads to split, then split it. 
- if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) - - for key, action in base_actions.items(): - if "layers.0." in key: - for i in range(num_layers): - final_actions[key.replace("layers.0.", f"layers.{i}.")] = action - final_actions[key] = action - - # Add tp split for expert params. - if config.fuse_attention_ffn: - base_actions = { - "layers.0.mlp.experts.0.up_gate_proj.weight": partial(fn, is_column=True, is_naive_2fuse=True), - "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), - } - else: - # Add tp split for expert params. - base_actions = { - "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), - } - for key, action in base_actions.items(): - for i in range(num_layers): - newkey = key.replace("layers.0.", f"layers.{i}.") - for j in range(num_experts): - newkey2 = newkey.replace("experts.0.", f"experts.{j}.") - final_actions[newkey2] = action + EXPERT_LAYER_ROWWISE = ["down_proj.weight"] - # Add tp split for shared expert params. - base_actions = {} - for key, action in base_actions.items(): - if "layers.0." in key: - for i in range(num_layers): - final_actions[key.replace("layers.0.", f"layers.{i}.")] = action - final_actions[key] = action + BIAS_KEYS = [ + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", + ] + FUSE_BIAS_KEYS = [ + "self_attn.qkv_proj.bias", + ] - return final_actions + def make_base_actions(): + actions = { + "lm_head.weight": partial(fn, is_column=False), + f"{cls.base_model_prefix}.embed_tokens.weight": partial(fn, is_column=False), + } + for layer_idx in range(config.num_hidden_layers): + if not config.fuse_attention_qkv: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in LAYER_COLWISE + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FUSE_LAYER_COLWISE + } + ) + + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=False) + for k in LAYER_ROWWISE + } + ) + + try: + moe_group = fleet.get_hybrid_communicate_group().get_expert_parallel_group() + except Exception: + moe_group = None + expert_parallel_degree = dist.get_world_size(moe_group) if moe_group is not None else 1 + # TODO: merge disable_ffn_model_parallel and expert_parallel_degree + if expert_parallel_degree <= 1: + # # if disable_ffn_model_parallel is True, disable expert layer tp plan + # if not config.disable_ffn_model_parallel: + if not config.fuse_attention_ffn: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( + fn, is_column=True + ) + for e in range(config.num_experts) + for k in EXPERT_LAYER_COLWISE + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( + fn, is_column=True + ) + for e in range(config.num_experts) + for k in FUSE_EXPERT_LAYER_COLWISE + } + ) + + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( 
+ fn, is_column=False + ) + for e in range(config.num_experts) + for k in EXPERT_LAYER_ROWWISE + } + ) + + # bias + if config.attention_bias: + if not config.fuse_attention_qkv: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in BIAS_KEYS + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in FUSE_BIAS_KEYS + } + ) - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) + return actions + mappings = make_base_actions() return mappings @classmethod From 98a6f8d8860def4dcf582fe68ef8aafe2f25248a Mon Sep 17 00:00:00 2001 From: llbdyiu66 Date: Thu, 11 Dec 2025 19:32:11 +0800 Subject: [PATCH 4/4] fix --- paddleformers/transformers/qwen2/modeling.py | 26 ++++++++-- .../transformers/qwen2_moe/modeling.py | 50 +++++++++---------- paddleformers/transformers/qwen3/modeling.py | 25 +++++++++- .../transformers/qwen3_moe/modeling.py | 34 ++++++------- 4 files changed, 86 insertions(+), 49 deletions(-) diff --git a/paddleformers/transformers/qwen2/modeling.py b/paddleformers/transformers/qwen2/modeling.py index 1bc41b675f..dfc3c5e09d 100644 --- a/paddleformers/transformers/qwen2/modeling.py +++ b/paddleformers/transformers/qwen2/modeling.py @@ -321,12 +321,16 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): "self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight", - "mlp.up_proj.weight", - "mlp.gate_proj.weight", ] - FUSE_LAYER_COLWISE = [ "self_attn.qkv_proj.weight", + ] + + FFN_LAYER_COLWISE = [ + "mlp.up_proj.weight", + "mlp.gate_proj.weight", + ] + FUSE_FFN_LAYER_COLWISE = [ "mlp.up_gate_proj.weight", ] @@ -374,6 +378,22 @@ def make_base_actions(): for b in FUSE_BIAS_KEYS } ) + if not config.fuse_attention_ffn: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FFN_LAYER_COLWISE + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial( + fn, is_column=True, is_naive_2fuse=True + ) + for k in FUSE_FFN_LAYER_COLWISE + } + ) # rowwise actions.update( { diff --git a/paddleformers/transformers/qwen2_moe/modeling.py b/paddleformers/transformers/qwen2_moe/modeling.py index 6e71442108..11e5b7a35a 100644 --- a/paddleformers/transformers/qwen2_moe/modeling.py +++ b/paddleformers/transformers/qwen2_moe/modeling.py @@ -580,6 +580,29 @@ def make_base_actions(): for k in LAYER_COLWISE } ) + if config.qkv_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in BIAS_KEYS + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FUSE_LAYER_COLWISE + } + ) + if config.qkv_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in FUSE_BIAS_KEYS + } + ) + + if not config.fuse_attention_ffn: actions.update( { f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( @@ -597,25 +620,11 @@ def make_base_actions(): for k in SHARED_EXPERT_LAYER_COLWISE } ) - # bias - if config.qkv_bias: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in BIAS_KEYS - } - ) else: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) - for k in FUSE_LAYER_COLWISE - } - ) actions.update( { 
f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( - fn, is_column=True + fn, is_column=True, is_naive_2fuse=True ) for e in range(config.num_experts) for k in FUSE_EXPERT_LAYER_COLWISE @@ -624,20 +633,11 @@ def make_base_actions(): actions.update( { f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( - fn, is_column=True + fn, is_column=True, is_naive_2fuse=True ) for k in FUSE_SHARED_EXPERT_LAYER_COLWISE } ) - # bias - if config.qkv_bias: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in FUSE_BIAS_KEYS - } - ) - # rowwise actions.update( { diff --git a/paddleformers/transformers/qwen3/modeling.py b/paddleformers/transformers/qwen3/modeling.py index f7dcae2691..987ae0deb3 100644 --- a/paddleformers/transformers/qwen3/modeling.py +++ b/paddleformers/transformers/qwen3/modeling.py @@ -337,11 +337,16 @@ def _get_tensor_parallel_mappings(cls, config: Qwen3Config, is_split=True): "self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight", - "mlp.up_proj.weight", - "mlp.gate_proj.weight", ] FUSE_LAYER_COLWISE = [ "self_attn.qkv_proj.weight", + ] + + FFN_LAYER_COLWISE = [ + "mlp.up_proj.weight", + "mlp.gate_proj.weight", + ] + FUSE_FFN_LAYER_COLWISE = [ "mlp.up_gate_proj.weight", ] @@ -393,6 +398,22 @@ def make_base_actions(): for b in FUSE_BIAS_KEYS } ) + if not config.fuse_attention_ffn: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial(fn, is_column=True) + for k in FFN_LAYER_COLWISE + } + ) + else: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{k}": partial( + fn, is_column=True, is_naive_2fuse=True + ) + for k in FUSE_FFN_LAYER_COLWISE + } + ) # rowwise actions.update( { diff --git a/paddleformers/transformers/qwen3_moe/modeling.py b/paddleformers/transformers/qwen3_moe/modeling.py index 89b04c2c77..4c407a00c0 100644 --- a/paddleformers/transformers/qwen3_moe/modeling.py +++ b/paddleformers/transformers/qwen3_moe/modeling.py @@ -600,6 +600,13 @@ def make_base_actions(): for k in LAYER_COLWISE } ) + if config.attention_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in BIAS_KEYS + } + ) else: actions.update( { @@ -607,6 +614,13 @@ def make_base_actions(): for k in FUSE_LAYER_COLWISE } ) + if config.attention_bias: + actions.update( + { + f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) + for b in FUSE_BIAS_KEYS + } + ) actions.update( { @@ -638,13 +652,12 @@ def make_base_actions(): actions.update( { f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( - fn, is_column=True + fn, is_column=True, is_naive_2fuse=True ) for e in range(config.num_experts) for k in FUSE_EXPERT_LAYER_COLWISE } ) - actions.update( { f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( @@ -655,23 +668,6 @@ def make_base_actions(): } ) - # bias - if config.attention_bias: - if not config.fuse_attention_qkv: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in BIAS_KEYS - } - ) - else: - actions.update( - { - f"{cls.base_model_prefix}.layers.{layer_idx}.{b}": partial(fn, is_column=True) - for b in FUSE_BIAS_KEYS - } - ) - return actions mappings = make_base_actions()