From 4a61f7d20dfa08ee5f6a6693763d86b41792bee1 Mon Sep 17 00:00:00 2001
From: Ruonan Wang <ruonan1.wang@intel.com>
Date: Thu, 22 Aug 2024 05:34:53 -0700
Subject: [PATCH] update mlp of llama (#11897)

* update mlp of llama

* relax threshold of mlp test

* revert code
---
 python/llm/src/ipex_llm/transformers/models/llama.py   | 10 ++++++++++
 .../test/inference_gpu/test_transformers_api_mlp.py    |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index 2c9c17e7a58..dfbbaf003a6 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -305,6 +305,16 @@ def llama_mlp_forward(
             )
             hidden_states = attn_output.view(x.shape)
         return hidden_states
+    elif x.device.type == "xpu" and not self.training:
+        import xe_addons
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        xe_addons.mlp_silu_mul_inplaced(gate, up)
+        out = self.down_proj(gate)
+        if residual is not None:
+            return out + residual
+        else:
+            return out
     else:
         a = self.act_fn(self.gate_proj(x))
         b = self.up_proj(x)
diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
index c6229d73fc4..d46d939a8ef 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
@@ -134,7 +134,7 @@ def Mistral_7B_Instruct_gpu_model(self, Name, Model, Tokenizer, model_path):
         # currently only compare the output of the last mlp layer.
         layer_before_MLP = "model.layers.31.post_attention_layernorm"
         MLP_layer = "model.layers.31.mlp"
-        lower_bound = 0
+        lower_bound = 1e-3
         self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, MLP_layer, layer_before_MLP, lower_bound)
 
     def Llama2_7B_gpu_model(self, Name, Model, Tokenizer, model_path):