[NPU] update save-load API usage (#12473)
plusbang authored Dec 3, 2024
1 parent 26adb82 commit ab01753
Showing 20 changed files with 166 additions and 188 deletions.
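
These changes replace the optional `--lowbit-path` save/load flow with a `--save-directory` argument: passing `save_directory` to `from_pretrained(...)` converts the model and writes the converted low-bit weights into that folder on the first run, while `load_low_bit(save_directory, ...)` reloads them on later runs, so the separate `model.save_low_bit(...)` call is removed. The sketch below illustrates the updated pattern; the import path for the NPU `AutoModelForCausalLM` is an assumption (the import lines are not part of the hunks shown), and the model id and argument values are illustrative only.

```python
import os

import torch
from transformers import AutoTokenizer
# Assumed import: these NPU examples are believed to use the ipex-llm NPU model class,
# but the import lines themselves are not shown in this diff.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"    # example model id
save_directory = "./save_converted_model_dir"   # same default used by the benchmark script

if not os.path.exists(save_directory):
    # First run: convert the model to its low-bit form and save it into save_directory.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 trust_remote_code=True,
                                                 save_directory=save_directory)
else:
    # Later runs: load the already-converted low-bit model directly from save_directory.
    model = AutoModelForCausalLM.load_low_bit(save_directory,
                                              attn_implementation="eager",
                                              torch_dtype=torch.float16,
                                              max_context_len=1024)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Because the converted model is written to `save_directory` on the first run, subsequent runs skip the conversion step entirely.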
6 changes: 4 additions & 2 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -634,7 +634,7 @@ def transformers_int4_npu_win(repo_id,
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
attn_implementation="eager", torch_dtype=torch.float16).eval()
save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
model = model.llm
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
@@ -702,14 +702,16 @@ def transformers_int4_npu_pipeline_win(repo_id,
in_out_len = in_out_pairs[0].split("-")
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
mixed_precision = True if npu_group_size == 0 else False
+save_directory = "./save_converted_model_dir"
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
st = time.perf_counter()

model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision).eval()
+use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision,
+save_directory=save_directory).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

end = time.perf_counter()

@@ -47,45 +47,45 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam

```cmd
:: to run Llama-2-7b-chat-hf
-python llama2.py
+python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory <converted_model_path>
:: to run Meta-Llama-3-8B-Instruct
-python llama3.py
+python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory <converted_model_path>
:: to run Llama-3.2-1B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory <converted_model_path>
:: to run Llama-3.2-3B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory <converted_model_path>
:: to run Qwen2.5-7B-Instruct
-python qwen.py
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory <converted_model_path>
:: to run Qwen2-1.5B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory <converted_model_path>
:: to run Qwen2.5-3B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory <converted_model_path>
:: to run Baichuan2-7B-Chat
-python baichuan2.py
+python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory <converted_model_path>
:: to run MiniCPM-1B-sft-bf16
-python minicpm.py
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory <converted_model_path>
:: to run MiniCPM-2B-sft-bf16
-python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16"
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory <converted_model_path>
```

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
-- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
- `--disable-streaming`: Disable streaming mode of generation.
+- `--save-directory SAVE_DIRECTORY`: argument defining the path to save the converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded and the converted lowbit model will be saved into `SAVE_DIRECTORY`; otherwise, the lowbit model in `SAVE_DIRECTORY` will be loaded.
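
For example, with a hypothetical local folder `converted_llama2` as the save directory, the first run of a command converts the model and saves it into that folder, and running the same command again reloads the converted model instead of converting it again:

```cmd
:: first run: converts Llama-2-7b-chat-hf and saves the converted model into converted_llama2
python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory converted_llama2
:: later runs: the converted model is loaded from converted_llama2
python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory converted_llama2
```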

### Sample Output of Streaming Mode
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)

@@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -77,10 +77,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
-trust_remote_code=True)
+trust_remote_code=True,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -92,9 +93,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:

@@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -76,10 +76,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
-transpose_value_cache=not args.disable_transpose_value_cache)
+transpose_value_cache=not args.disable_transpose_value_cache,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -90,9 +91,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:

@@ -55,12 +55,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -69,11 +63,17 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
@@ -82,10 +82,11 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
-transpose_value_cache=not args.disable_transpose_value_cache)
+transpose_value_cache=not args.disable_transpose_value_cache,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -96,9 +97,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:

@@ -36,12 +36,6 @@
help="The huggingface repo id for the MiniCPM model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -50,11 +44,17 @@
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -64,10 +64,11 @@
attn_implementation="eager",
quantization_group_size=args.quantization_group_size,
transpose_value_cache=not args.disable_transpose_value_cache,
-trust_remote_code=True)
+trust_remote_code=True,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -79,9 +80,6 @@

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:
