From 714887ef852499e40cf661ff379fccf793bc24f0 Mon Sep 17 00:00:00 2001
From: plusbang
Date: Mon, 2 Dec 2024 16:55:13 +0800
Subject: [PATCH] save_directory is required argument for all models

---
 python/llm/dev/benchmark/all-in-one/run.py | 6 ++--
 .../LLM/Pipeline-Models/README.md | 22 +++++++-------
 .../LLM/Pipeline-Models/baichuan2.py | 22 +++++++-------
 .../LLM/Pipeline-Models/llama2.py | 22 +++++++-------
 .../LLM/Pipeline-Models/llama3.py | 22 +++++++-------
 .../LLM/Pipeline-Models/minicpm.py | 22 +++++++-------
 .../LLM/Pipeline-Models/qwen.py | 24 +++++++--------
 .../HF-Transformers-AutoModels/LLM/README.md | 20 ++++++-------
 .../LLM/baichuan2.py | 30 +++++++------------
 .../HF-Transformers-AutoModels/LLM/qwen.py | 4 +--
 .../Multimodal/README.md | 11 ++++---
 .../Multimodal/bce-embedding.py | 17 +++++------
 .../Multimodal/minicpm-llama3-v2.5.py | 11 ++++---
 .../Multimodal/minicpm_v_2_6.py | 11 ++++---
 .../Multimodal/speech_paraformer-large.py | 11 ++++---
 .../src/ipex_llm/transformers/npu_model.py | 9 +++---
 16 files changed, 129 insertions(+), 135 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 271c85dee224..fdf982cba414 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -634,7 +634,7 @@ def transformers_int4_npu_win(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model, trust_remote_code=True,
                                           use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                           quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                          attn_implementation="eager", torch_dtype=torch.float16).eval()
+                                          save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
         model = model.llm
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
@@ -702,6 +702,7 @@ def transformers_int4_npu_pipeline_win(repo_id,
     in_out_len = in_out_pairs[0].split("-")
     max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
     mixed_precision = True if npu_group_size == 0 else False
+    save_directory = "./save_converted_model_dir"
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
@@ -709,7 +710,8 @@ def transformers_int4_npu_pipeline_win(repo_id,
     model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True,
                                                  torch_dtype=torch.float16, optimize_model=optimize_model, max_context_len=max_context_len,
                                                  max_prompt_len=int(in_out_len[0]), quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                 use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision).eval()
+                                                 use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision,
+                                                 save_directory=save_directory).eval()
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
index 462199dbeee4..30db6e3f9bf9 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
@@ -47,45 +47,45 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
 ```cmd
 :: to run Llama-2-7b-chat-hf
-python llama2.py
+python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory

 :: to run Meta-Llama-3-8B-Instruct
-python llama3.py
+python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory

 :: to run Llama-3.2-1B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory

 :: to run Llama-3.2-3B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory

 :: to run Qwen2.5-7B-Instruct
-python qwen.py
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory

 :: to run Qwen2-1.5B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory

 :: to run Qwen2.5-3B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory

 :: to run Baichuan2-7B-Chat
-python baichuan2.py
+python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory

 :: to run MiniCPM-1B-sft-bf16
-python minicpm.py
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory

 :: to run MiniCPM-2B-sft-bf16
-python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16"
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory
 ```

 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
-- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 - `--disable-streaming`: Disable streaming mode of generation.
+- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
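The `--save-directory` behavior described above is the same convert-or-load pattern used throughout the updated example scripts in this patch. The condensed Python sketch below (not part of the patch) illustrates that flow; the model id and directory name are placeholders, and it assumes `ipex-llm` with NPU support is installed and uses the examples' usual import path.

```python
# Minimal sketch of the convert-or-load flow followed by the updated examples.
# Placeholder values: adjust model_path / save_dir for your own setup.
import os
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"   # or a local checkpoint folder
save_dir = "./llama2-npu-converted"            # plays the role of --save-directory

if not os.path.exists(save_dir):
    # First run: convert the model and save the low-bit version into save_dir.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 load_in_low_bit="sym_int4",
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 save_directory=save_dir)
else:
    # Later runs: load the already-converted low-bit model directly.
    model = AutoModelForCausalLM.load_low_bit(save_dir,
                                              attn_implementation="eager",
                                              torch_dtype=torch.float16,
                                              max_context_len=1024,
                                              max_prompt_len=512)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Re-running the same command with the same `--save-directory` only changes which branch is taken: conversion on the first run, a direct low-bit load afterwards.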
### Sample Output of Streaming Mode #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py index 53258002a66d..f3e3ddbc0ccd 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py @@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Baichuan2 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -77,10 +77,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], torch_dtype=torch.float16, attn_implementation="eager", transpose_value_cache=not args.disable_transpose_value_cache, - trust_remote_code=True) + trust_remote_code=True, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -92,9 +93,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py index c7168bcb4b9c..cb640bc7b059 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py @@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Llama2 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. 
\ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -76,10 +76,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], quantization_group_size=args.quantization_group_size, torch_dtype=torch.float16, attn_implementation="eager", - transpose_value_cache=not args.disable_transpose_value_cache) + transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -90,9 +91,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py index a837e03c6f3f..ac3433b92b46 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py @@ -55,12 +55,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Llama3 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -69,11 +63,17 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, optimize_model=True, @@ -82,10 +82,11 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], max_prompt_len=args.max_prompt_len, quantization_group_size=args.quantization_group_size, attn_implementation="eager", - transpose_value_cache=not args.disable_transpose_value_cache) + transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -96,9 +97,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py index d9bcae4bae58..df5bd756c99d 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py @@ -36,12 +36,6 @@ help="The huggingface repo id for the MiniCPM model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") @@ -50,11 +44,17 @@ parser.add_argument("--quantization_group_size", type=int, default=0) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -64,10 +64,11 @@ attn_implementation="eager", quantization_group_size=args.quantization_group_size, transpose_value_cache=not args.disable_transpose_value_cache, - trust_remote_code=True) + trust_remote_code=True, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -79,9 +80,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py index d04961ece875..ef5ded708960 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py @@ -36,27 +36,27 @@ help="The huggingface repo id for the Qwen model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="AI是什么?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--quantization_group_size", type=int, default=0) - parser.add_argument('--low_bit', type=str, default="sym_int4", + parser.add_argument('--low-bit', type=str, default="sym_int4", help='Low bit precision to quantize the model') parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--disable-streaming", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, @@ -68,10 +68,11 @@ attn_implementation="eager", transpose_value_cache=not args.disable_transpose_value_cache, mixed_precision=True, - trust_remote_code=True) + trust_remote_code=True, + save_directory=args.save_directory) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", torch_dtype=torch.float16, max_context_len=args.max_context_len, @@ -81,9 +82,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - if args.disable_streaming: streamer = None else: diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md index 1377b7a0c087..e114f1156f48 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md @@ -99,34 +99,34 @@ The examples below show how to run the **_optimized HuggingFace model implementa ### Run ```cmd :: to run Llama-2-7b-chat-hf -python llama2.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory +python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory :: to run Meta-Llama-3-8B-Instruct -python llama3.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory +python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory :: to run Llama-3.2-1B-Instruct -python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory +python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory :: to run Llama-3.2-3B-Instruct -python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --save-directory +python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory :: to run Qwen2-1.5B-Instruct -python qwen.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --low_bit sym_int8 --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory :: to run Qwen2.5-3B-Instruct -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8 --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory :: to run Qwen2.5-7B-Instruct -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory :: to run MiniCPM-1B-sft-bf16 -python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory +python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory :: to run MiniCPM-2B-sft-bf16 -python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory +python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory :: to run Baichuan2-7B-Chat -python baichuan2.py +python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory ``` Arguments info: diff --git 
a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py index 1d528357fece..05c47076ede0 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py @@ -50,57 +50,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], help="The huggingface repo id for the Baichuan2 model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path - if not args.lowbit_path or not os.path.exists(args.lowbit_path): + if not os.path.exists(args.save_directory): model = AutoModelForCausalLM.from_pretrained( model_path, - torch_dtype=torch.bfloat16, + torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) else: model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, + args.save_directory, attn_implementation="eager", - torch_dtype=torch.bfloat16, + torch_dtype=torch.float16, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, trust_remote_code=True, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - DEFAULT_SYSTEM_PROMPT = """\ """ diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py index 93f5b9e09d4b..9089475cc2ba 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py @@ -43,7 +43,7 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=960) parser.add_argument("--quantization_group_size", type=int, default=0) - parser.add_argument('--low_bit', type=str, default="sym_int4", + parser.add_argument('--low-bit', type=str, default="sym_int4", help='Load in low bit to use') parser.add_argument("--disable-transpose-value-cache", action="store_true", 
default=False) parser.add_argument("--save-directory", type=str, @@ -98,7 +98,7 @@ print("input length:", len(_input_ids[0])) st = time.time() output = model.generate( - _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict, do_print=True ) end = time.time() print(f"Inference time: {end-st} s") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md index 53f47df79463..6e1b3db72b9e 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md @@ -102,10 +102,10 @@ The examples below show how to run the **_optimized HuggingFace & FunASR model i ### 4.1 Run MiniCPM-Llama3-V-2_5 & MiniCPM-V-2_6 ```bash # to run MiniCPM-Llama3-V-2_5 -python minicpm-llama3-v2.5.py +python minicpm-llama3-v2.5.py --save-directory # to run MiniCPM-V-2_6 -python minicpm_v_2_6.py +python minicpm_v_2_6.py --save-directory ``` Arguments info: @@ -116,6 +116,7 @@ Arguments info: - `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. #### Sample Output ##### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) @@ -133,12 +134,13 @@ The image features a young child holding and showing off a white teddy bear wear ### 4.2 Run Speech_Paraformer-Large ```bash # to run Speech_Paraformer-Large -python speech_paraformer-large.py +python speech_paraformer-large.py --save-directory ``` Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the asr repo id for the model (i.e. `iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch`) to be downloaded, or the path to the asr checkpoint folder. - `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. #### Sample Output ##### [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) @@ -156,11 +158,12 @@ rtf_avg: 0.232: 100%|███████████████████ ### 4.3 Run Bce-Embedding-Base-V1 ```bash # to run Bce-Embedding-Base-V1 -python bce-embedding.py +python bce-embedding.py --save-directory ``` Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the asr repo id for the model (i.e. `maidalun1020/bce-embedding-base_v1`) to be downloaded, or the path to the asr checkpoint folder. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. 
If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. #### Sample Output ##### [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) | diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py index a2f3550d52a0..760a5e5f28bc 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py @@ -35,19 +35,17 @@ help="The huggingface repo id for the bce-embedding model to be downloaded" ", or the path to the huggingface checkpoint folder", ) - parser.add_argument("--lowbit-path", type=str, - default="", - help="The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.", - ) parser.add_argument('--prompt', type=str, default="'sentence_0', 'sentence_1'", help='Prompt to infer') parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -60,9 +58,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) # list of sentences diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py index e7ffaf53c41e..e4cdef6120ae 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py @@ -48,8 +48,12 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -63,9 +67,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py index 1a524a5b2dc8..ec6b5361aa20 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py @@ -39,8 +39,12 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=None) - parser.add_argument("--inter-pp", type=int, default=None) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -54,9 +58,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py index d2ffe3ad8cc1..0bf03d411cd0 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py @@ -35,8 +35,12 @@ ) parser.add_argument('--load_in_low_bit', type=str, default="sym_int8", help='Load in low bit to use') - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, lowbit model will be loaded.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -47,8 +51,7 @@ load_in_low_bit=args.load_in_low_bit, low_cpu_mem_usage=True, optimize_model=True, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, + save_directory=args.save_directory ) res = model.generate(input=f"{model.model_path}/example/asr_example.wav", diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index cbf14d509c4c..1096b12f05f5 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -46,7 +46,6 @@ def ignore_argument(kwargs: dict, key: "str"): def save_low_bit(self, model_dir: str, *args, **kwargs): if hasattr(self, "save_directory"): - logger.info(f"Model has already saved to {self.save_directory}.") return 1 origin_device = self.device kwargs["safe_serialization"] = False @@ -258,6 +257,9 @@ def optimize_npu_model(cls, *args, **kwargs): save_directory = kwargs.pop('save_directory', None) fuse_layers = kwargs.pop('fuse_layers', None) imatrix_data = kwargs.pop('imatrix_data', None) + invalidInputError(save_directory is not None, + "Please provide the path to save converted model " + "through `save_directory`.") if hasattr(model, "llm"): llm = model.llm @@ -280,9 +282,6 @@ def optimize_npu_model(cls, *args, **kwargs): if not pipeline: if model.config.model_type in ["qwen2", "llama", "minicpm"]: - invalidInputError(save_directory is not None, - "Please provide the path to save converted model " - "through `save_directory`.") from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process optimize_llm_single_process( llm, @@ -317,6 +316,8 @@ def optimize_npu_model(cls, *args, **kwargs): save_directory=save_directory, fuse_layers=fuse_layers) model.save_low_bit = types.MethodType(save_low_bit, model) + model.save_low_bit(save_directory) + logger.info(f"Converted model has already saved to {save_directory}.") return model @classmethod