From 714887ef852499e40cf661ff379fccf793bc24f0 Mon Sep 17 00:00:00 2001
From: plusbang
Date: Mon, 2 Dec 2024 16:55:13 +0800
Subject: [PATCH] save_directory is required argument for all models

---
 python/llm/dev/benchmark/all-in-one/run.py | 6 ++--
 .../LLM/Pipeline-Models/README.md | 22 +++++++-------
 .../LLM/Pipeline-Models/baichuan2.py | 22 +++++++-------
 .../LLM/Pipeline-Models/llama2.py | 22 +++++++-------
 .../LLM/Pipeline-Models/llama3.py | 22 +++++++-------
 .../LLM/Pipeline-Models/minicpm.py | 22 +++++++-------
 .../LLM/Pipeline-Models/qwen.py | 24 +++++++--------
 .../HF-Transformers-AutoModels/LLM/README.md | 20 ++++++-------
 .../LLM/baichuan2.py | 30 +++++++------------
 .../HF-Transformers-AutoModels/LLM/qwen.py | 4 +--
 .../Multimodal/README.md | 11 ++++---
 .../Multimodal/bce-embedding.py | 17 +++++------
 .../Multimodal/minicpm-llama3-v2.5.py | 11 ++++---
 .../Multimodal/minicpm_v_2_6.py | 11 ++++---
 .../Multimodal/speech_paraformer-large.py | 11 ++++---
 .../src/ipex_llm/transformers/npu_model.py | 9 +++---
 16 files changed, 129 insertions(+), 135 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 271c85dee224..fdf982cba414 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -634,7 +634,7 @@ def transformers_int4_npu_win(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model, trust_remote_code=True,
                                           use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                           quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                          attn_implementation="eager", torch_dtype=torch.float16).eval()
+                                          save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
         model = model.llm
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
@@ -702,6 +702,7 @@ def transformers_int4_npu_pipeline_win(repo_id,
     in_out_len = in_out_pairs[0].split("-")
     max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
     mixed_precision = True if npu_group_size == 0 else False
+    save_directory = "./save_converted_model_dir"
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
@@ -709,7 +710,8 @@ def transformers_int4_npu_pipeline_win(repo_id,
     model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True,
                                                  torch_dtype=torch.float16, optimize_model=optimize_model, max_context_len=max_context_len,
                                                  max_prompt_len=int(in_out_len[0]), quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                 use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision).eval()
+                                                 use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision,
+                                                 save_directory=save_directory).eval()
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
index 462199dbeee4..30db6e3f9bf9 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
@@ -47,45 +47,45 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
 ```cmd
 :: to run Llama-2-7b-chat-hf
-python llama2.py
+python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory

 :: to run Meta-Llama-3-8B-Instruct
-python llama3.py
+python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory

 :: to run Llama-3.2-1B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory

 :: to run Llama-3.2-3B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory

 :: to run Qwen2.5-7B-Instruct
-python qwen.py
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory

 :: to run Qwen2-1.5B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory

 :: to run Qwen2.5-3B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory

 :: to run Baichuan2-7B-Chat
-python baichuan2.py
+python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory

 :: to run MiniCPM-1B-sft-bf16
-python minicpm.py
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory

 :: to run MiniCPM-2B-sft-bf16
-python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16"
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory
 ```

 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
-- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 - `--disable-streaming`: Disable streaming mode of generation.
+- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
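The `--save-directory` behavior described above is the same convert-or-load pattern used throughout the updated example scripts in this patch. The condensed Python sketch below (not part of the patch) illustrates that flow; the model id and directory name are placeholders, and it assumes `ipex-llm` with NPU support is installed and uses the examples' usual import path.

```python
# Minimal sketch of the convert-or-load flow followed by the updated examples.
# Placeholder values: adjust model_path / save_dir for your own setup.
import os
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"   # or a local checkpoint folder
save_dir = "./llama2-npu-converted"            # plays the role of --save-directory

if not os.path.exists(save_dir):
    # First run: convert the model and save the low-bit version into save_dir.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 load_in_low_bit="sym_int4",
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 save_directory=save_dir)
else:
    # Later runs: load the already-converted low-bit model directly.
    model = AutoModelForCausalLM.load_low_bit(save_dir,
                                              attn_implementation="eager",
                                              torch_dtype=torch.float16,
                                              max_context_len=1024,
                                              max_prompt_len=512)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Re-running the same command with the same `--save-directory` only changes which branch is taken: conversion on the first run, a direct low-bit load afterwards.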
### Sample Output of Streaming Mode #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py index 53258002a66d..f3e3ddbc0ccd 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py @@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Baichuan2 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -77,10 +77,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], torch_dtype=torch.float16, attn_implementation="eager", transpose_value_cache=not args.disable_transpose_value_cache, - trust_remote_code=True) + trust_remote_code=True, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -92,9 +93,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py index c7168bcb4b9c..cb640bc7b059 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py @@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Llama2 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. 
\ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -76,10 +76,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], quantization_group_size=args.quantization_group_size, torch_dtype=torch.float16, attn_implementation="eager", - transpose_value_cache=not args.disable_transpose_value_cache) + transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -90,9 +91,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py index a837e03c6f3f..ac3433b92b46 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py @@ -55,12 +55,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Llama3 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -69,11 +63,17 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, optimize_model=True, @@ -82,10 +82,11 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], max_prompt_len=args.max_prompt_len, quantization_group_size=args.quantization_group_size, attn_implementation="eager", - transpose_value_cache=not args.disable_transpose_value_cache) + transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -96,9 +97,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py index d9bcae4bae58..df5bd756c99d 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py @@ -36,12 +36,6 @@ help="The huggingface repo id for the MiniCPM model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -50,11 +44,17 @@ parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -64,10 +64,11 @@ attn_implementation="eager", quantization_group_size=args.quantization_group_size, transpose_value_cache=not args.disable_transpose_value_cache, - trust_remote_code=True) + trust_remote_code=True, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -79,9 +80,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py index d04961ece875..ef5ded708960 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py @@ -36,27 +36,27 @@ help="The huggingface repo id for the Qwen model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="AI是什么?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--quantization_group_size", type=int, default=0) - parser.add_argument('--low_bit', type=str, default="sym_int4", + parser.add_argument('--low-bit', type=str, default="sym_int4", help='Low bit precision to quantize the model') parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -68,10 +68,11 @@ attn_implementation="eager", transpose_value_cache=not args.disable_transpose_value_cache, mixed_precision=True, - trust_remote_code=True) + trust_remote_code=True, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -81,9 +82,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md index 1377b7a0c087..e114f1156f48 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md @@ -99,34 +99,34 @@ The examples below show how to run the **_optimized HuggingFace model implementa ### Run ```cmd :: to run Llama-2-7b-chat-hf -python llama2.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory +python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory :: to run Meta-Llama-3-8B-Instruct -python llama3.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory +python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory :: to run Llama-3.2-1B-Instruct -python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory +python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory :: to run Llama-3.2-3B-Instruct -python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --save-directory +python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory :: to run Qwen2-1.5B-Instruct -python qwen.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --low_bit sym_int8 --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory :: to run Qwen2.5-3B-Instruct -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8 --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory :: to run Qwen2.5-7B-Instruct -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory :: to run MiniCPM-1B-sft-bf16 -python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory +python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory :: to run MiniCPM-2B-sft-bf16 -python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory +python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory :: to run Baichuan2-7B-Chat -python baichuan2.py +python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory ``` Arguments info: diff --git 
a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py index 1d528357fece..05c47076ede0 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py @@ -50,57 +50,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Baichuan2 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained( model_path, - torch_dtype=torch.bfloat16, + torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", - torch_dtype=torch.bfloat16, + torch_dtype=torch.float16, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, trust_remote_code=True, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - DEFAULT_SYSTEM_PROMPT = """\ """ diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py index 93f5b9e09d4b..9089475cc2ba 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py @@ -43,7 +43,7 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=960) parser.add_argument("--quantization_group_size", type=int, default=0) - parser.add_argument('--low_bit', type=str, default="sym_int4", + parser.add_argument('--low-bit', type=str, default="sym_int4", help='Load in low bit to use') parser.add_argument("--disable-transpose-value-cache", action="store_true", 
default=False) parser.add_argument("--save-directory", type=str, @@ -98,7 +98,7 @@ print("input length:", len(_input_ids[0])) st = time.time() output = model.generate( - _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict, do_print=True ) end = time.time() print(f"Inference time: {end-st} s") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md index 53f47df79463..6e1b3db72b9e 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md @@ -102,10 +102,10 @@ The examples below show how to run the **_optimized HuggingFace & FunASR model i ### 4.1 Run MiniCPM-Llama3-V-2_5 & MiniCPM-V-2_6 ```bash # to run MiniCPM-Llama3-V-2_5 -python minicpm-llama3-v2.5.py +python minicpm-llama3-v2.5.py --save-directory # to run MiniCPM-V-2_6 -python minicpm_v_2_6.py +python minicpm_v_2_6.py --save-directory ``` Arguments info: @@ -116,6 +116,7 @@ Arguments info: - `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. #### Sample Output ##### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) @@ -133,12 +134,13 @@ The image features a young child holding and showing off a white teddy bear wear ### 4.2 Run Speech_Paraformer-Large ```bash # to run Speech_Paraformer-Large -python speech_paraformer-large.py +python speech_paraformer-large.py --save-directory ``` Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the asr repo id for the model (i.e. `iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch`) to be downloaded, or the path to the asr checkpoint folder. - `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. #### Sample Output ##### [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) @@ -156,11 +158,12 @@ rtf_avg: 0.232: 100%|███████████████████ ### 4.3 Run Bce-Embedding-Base-V1 ```bash # to run Bce-Embedding-Base-V1 -python bce-embedding.py +python bce-embedding.py --save-directory ``` Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the asr repo id for the model (i.e. `maidalun1020/bce-embedding-base_v1`) to be downloaded, or the path to the asr checkpoint folder. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. 
If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. #### Sample Output ##### [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) | diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py index a2f3550d52a0..760a5e5f28bc 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py @@ -35,19 +35,17 @@ help="The huggingface repo id for the bce-embedding model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="'sentence_0', 'sentence_1'", help='Prompt to infer') parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -60,9 +58,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) # list of sentences diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py index e7ffaf53c41e..e4cdef6120ae 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py @@ -48,8 +48,12 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -63,9 +67,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py index 1a524a5b2dc8..ec6b5361aa20 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py @@ -39,8 +39,12 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=None) - parser.add_argument("--inter-pp", type=int, default=None) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -54,9 +58,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py index d2ffe3ad8cc1..0bf03d411cd0 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py @@ -35,8 +35,12 @@ ) parser.add_argument('--load_in_low_bit', type=str, default="sym_int8", help='Load in low bit to use') - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -47,8 +51,7 @@ load_in_low_bit=args.load_in_low_bit, low_cpu_mem_usage=True, optimize_model=True, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, + save_directory=args.save_directory ) res = model.generate(input=f"{model.model_path}/example/asr_example.wav", diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index cbf14d509c4c..1096b12f05f5 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -46,7 +46,6 @@ def ignore_argument(kwargs: dict, key: "str"): def save_low_bit(self, model_dir: str, *args, **kwargs): if hasattr(self, "save_directory"): - logger.info(f"Model has already saved to {self.save_directory}.") return 1 origin_device = self.device kwargs["safe_serialization"] = False @@ -258,6 +257,9 @@ def optimize_npu_model(cls, *args, **kwargs): save_directory = kwargs.pop('save_directory', None) fuse_layers = kwargs.pop('fuse_layers', None) imatrix_data = kwargs.pop('imatrix_data', None) + invalidInputError(save_directory is not None, + "Please provide the path to save converted model " + "through `save_directory`.") if hasattr(model, "llm"): llm = model.llm @@ -280,9 +282,6 @@ def optimize_npu_model(cls, *args, **kwargs): if not pipeline: if model.config.model_type in ["qwen2", "llama", "minicpm"]: - invalidInputError(save_directory is not None, - "Please provide the path to save converted model " - "through `save_directory`.") from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process optimize_llm_single_process( llm, @@ -317,6 +316,8 @@ def optimize_npu_model(cls, *args, **kwargs): save_directory=save_directory, fuse_layers=fuse_layers) model.save_low_bit = types.MethodType(save_low_bit, model) + model.save_low_bit(save_directory) + logger.info(f"Converted model has already saved to {save_directory}.") return model @classmethod