From 4a61f7d20dfa08ee5f6a6693763d86b41792bee1 Mon Sep 17 00:00:00 2001 From: Ruonan Wang Date: Thu, 22 Aug 2024 05:34:53 -0700 Subject: [PATCH] update mlp of llama (#11897) * update mlp of llama * relax threshold of mlp test * revert code --- python/llm/src/ipex_llm/transformers/models/llama.py | 10 ++++++++++ .../test/inference_gpu/test_transformers_api_mlp.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py index 2c9c17e7a58..dfbbaf003a6 100644 --- a/python/llm/src/ipex_llm/transformers/models/llama.py +++ b/python/llm/src/ipex_llm/transformers/models/llama.py @@ -305,6 +305,16 @@ def llama_mlp_forward( ) hidden_states = attn_output.view(x.shape) return hidden_states + elif x.device.type == "xpu" and not self.training: + import xe_addons + gate = self.gate_proj(x) + up = self.up_proj(x) + xe_addons.mlp_silu_mul_inplaced(gate, up) + out = self.down_proj(gate) + if residual is not None: + return out + residual + else: + return out else: a = self.act_fn(self.gate_proj(x)) b = self.up_proj(x) diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py index c6229d73fc4..d46d939a8ef 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py +++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py @@ -134,7 +134,7 @@ def Mistral_7B_Instruct_gpu_model(self, Name, Model, Tokenizer, model_path): # currently only compare the output of the last mlp layer. layer_before_MLP = "model.layers.31.post_attention_layernorm" MLP_layer = "model.layers.31.mlp" - lower_bound = 0 + lower_bound = 1e-3 self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, MLP_layer, layer_before_MLP, lower_bound) def Llama2_7B_gpu_model(self, Name, Model, Tokenizer, model_path):