From a842392ab8aa0b265e12c9919ccc459fd23d9980 Mon Sep 17 00:00:00 2001
From: makubes <2416013822@qq.com>
Date: Thu, 25 Sep 2025 22:12:18 +0800
Subject: [PATCH 1/3] fixbug: chatglm v2 remove useless memcpy

---
 paddlenlp/transformers/chatglm_v2/modeling.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/paddlenlp/transformers/chatglm_v2/modeling.py b/paddlenlp/transformers/chatglm_v2/modeling.py
index d4b4a39758d9..8df6dcf9304d 100644
--- a/paddlenlp/transformers/chatglm_v2/modeling.py
+++ b/paddlenlp/transformers/chatglm_v2/modeling.py
@@ -414,6 +414,15 @@ def forward(
                 value_layer.shape[:2] + [self.num_attention_heads_per_partition, self.hidden_size_per_attention_head]
             )
 
+            B, S, G, D = key_layer.shape
+            key_layer = key_layer.unsqueeze(-2)
+            key_layer = key_layer.expand(B, S, G, multiplier, D)
+            key_layer = key_layer.reshape(B, S, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            value_layer = value_layer.unsqueeze(-2)
+            value_layer = value_layer.expand(B, S, G, multiplier, D)
+            value_layer = value_layer.reshape(B, S, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+
+
         # ==================================
         # core attention computation
         # ==================================

From 2a3dbfc3d9891d6b173780c0aa5c4e9a3b104e26 Mon Sep 17 00:00:00 2001
From: makubes <2416013822@qq.com>
Date: Thu, 25 Sep 2025 23:52:34 +0800
Subject: [PATCH 2/3] feat: support fast infer in p800

---
 paddlenlp/transformers/chatglm_v2/modeling.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/paddlenlp/transformers/chatglm_v2/modeling.py b/paddlenlp/transformers/chatglm_v2/modeling.py
index 8df6dcf9304d..a4632a478649 100644
--- a/paddlenlp/transformers/chatglm_v2/modeling.py
+++ b/paddlenlp/transformers/chatglm_v2/modeling.py
@@ -405,23 +405,13 @@ def forward(
 
             multiplier = self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition
 
-            key_layer = key_layer.unsqueeze(-2).tile([1, 1, 1, multiplier, 1])
-            key_layer = key_layer.reshape(
-                key_layer.shape[:2] + [self.num_attention_heads_per_partition, self.hidden_size_per_attention_head]
-            )
-            value_layer = value_layer.unsqueeze(-2).tile([1, 1, 1, multiplier, 1])
-            value_layer = value_layer.reshape(
-                value_layer.shape[:2] + [self.num_attention_heads_per_partition, self.hidden_size_per_attention_head]
-            )
-
-            B, S, G, D = key_layer.shape
+            S, B, G, D = key_layer.shape
             key_layer = key_layer.unsqueeze(-2)
-            key_layer = key_layer.expand(B, S, G, multiplier, D)
-            key_layer = key_layer.reshape(B, S, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            key_layer = key_layer.expand(S, B, G, multiplier, D)
+            key_layer = key_layer.reshape( S, B, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
             value_layer = value_layer.unsqueeze(-2)
-            value_layer = value_layer.expand(B, S, G, multiplier, D)
-            value_layer = value_layer.reshape(B, S, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
-
+            value_layer = value_layer.expand(S, B, G, multiplier, D)
+            value_layer = value_layer.reshape(S, B, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
 
         # ==================================
         # core attention computation

From 4713599b8f6e8277a02ddb55236ab90e63fdeac6 Mon Sep 17 00:00:00 2001
From: makubes <2416013822@qq.com>
Date: Fri, 26 Sep 2025 00:52:23 +0800
Subject: [PATCH 3/3] feat: support fast infer in p800(#2)

---
 paddlenlp/transformers/chatglm_v2/modeling.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/transformers/chatglm_v2/modeling.py b/paddlenlp/transformers/chatglm_v2/modeling.py
index a4632a478649..bfe740616433 100644
--- a/paddlenlp/transformers/chatglm_v2/modeling.py
+++ b/paddlenlp/transformers/chatglm_v2/modeling.py
@@ -408,10 +408,14 @@ def forward(
             S, B, G, D = key_layer.shape
             key_layer = key_layer.unsqueeze(-2)
             key_layer = key_layer.expand(S, B, G, multiplier, D)
-            key_layer = key_layer.reshape( S, B, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            key_layer = key_layer.reshape(
+                S, B, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head
+            )
             value_layer = value_layer.unsqueeze(-2)
             value_layer = value_layer.expand(S, B, G, multiplier, D)
-            value_layer = value_layer.reshape(S, B, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            value_layer = value_layer.reshape(
+                S, B, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head
+            )
 
         # ==================================
         # core attention computation
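
A standalone sketch of what the series changes, not part of the patches themselves: grouped-query attention keeps G key/value heads and repeats each one `multiplier` times to match the query heads. PATCH 1/3 swaps the tile-based repeat, which eagerly copies every KV head (the "useless memcpy" in the subject), for unsqueeze + expand + reshape, and PATCH 2/3 corrects the dim order to the [seq_len, batch, ...] layout the layer runs in. The sizes, the multiplier value, and the list-style shape arguments below are illustrative assumptions, not values from the model config:

import paddle

# Illustrative sizes only (assumptions, not the ChatGLM2 config):
# S = seq_len, B = batch, G = KV groups, D = head dim.
S, B, G, D = 6, 2, 2, 8
multiplier = 4                 # query heads per KV group
num_heads = G * multiplier

key = paddle.randn([S, B, G, D])

# Old path (pre-series): tile eagerly replicates every KV head
# `multiplier` times before reshaping to the full head count.
tiled = key.unsqueeze(-2).tile([1, 1, 1, multiplier, 1])
tiled = tiled.reshape([S, B, num_heads, D])

# New path (this series): expand broadcasts the singleton axis instead,
# avoiding the eager per-head replication the commit message removes.
expanded = key.unsqueeze(-2).expand([S, B, G, multiplier, D])
expanded = expanded.reshape([S, B, num_heads, D])

# Both paths hand identical tensors to the core attention computation.
assert bool(paddle.allclose(tiled, expanded))
print("tile and expand paths match:", tiled.shape, expanded.shape)

The assert only confirms the two paths agree numerically; the memory win comes from expand broadcasting instead of materializing `multiplier` copies up front, and how much copying is actually saved depends on the Paddle build and device.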