[NPU] update save-load API usage (#12473)
plusbang authored Dec 3, 2024
1 parent 26adb82 commit ab01753
Showing 20 changed files with 166 additions and 188 deletions.
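
These changes replace the optional `--lowbit-path` save/load flow with a `--save-directory` argument: passing `save_directory` to `from_pretrained(...)` converts the model and writes the converted low-bit weights into that folder on the first run, while `load_low_bit(save_directory, ...)` reloads them on later runs, so the separate `model.save_low_bit(...)` call is removed. The sketch below illustrates the updated pattern; the import path for the NPU `AutoModelForCausalLM` is an assumption (the import lines are not part of the hunks shown), and the model id and argument values are illustrative only.

```python
import os

import torch
from transformers import AutoTokenizer
# Assumed import: these NPU examples are believed to use the ipex-llm NPU model class,
# but the import lines themselves are not shown in this diff.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"    # example model id
save_directory = "./save_converted_model_dir"   # same default used by the benchmark script

if not os.path.exists(save_directory):
    # First run: convert the model to its low-bit form and save it into save_directory.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 trust_remote_code=True,
                                                 save_directory=save_directory)
else:
    # Later runs: load the already-converted low-bit model directly from save_directory.
    model = AutoModelForCausalLM.load_low_bit(save_directory,
                                              attn_implementation="eager",
                                              torch_dtype=torch.float16,
                                              max_context_len=1024)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Because the converted model is written to `save_directory` on the first run, subsequent runs skip the conversion step entirely.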
6 changes: 4 additions & 2 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -634,7 +634,7 @@ def transformers_int4_npu_win(repo_id,
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
attn_implementation="eager", torch_dtype=torch.float16).eval()
save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
model = model.llm
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
@@ -702,14 +702,16 @@ def transformers_int4_npu_pipeline_win(repo_id,
in_out_len = in_out_pairs[0].split("-")
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
mixed_precision = True if npu_group_size == 0 else False
+save_directory = "./save_converted_model_dir"
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
st = time.perf_counter()

model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision).eval()
+use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision,
+save_directory=save_directory).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

end = time.perf_counter()

@@ -47,45 +47,45 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam

```cmd
:: to run Llama-2-7b-chat-hf
-python llama2.py
+python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory <converted_model_path>
:: to run Meta-Llama-3-8B-Instruct
-python llama3.py
+python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory <converted_model_path>
:: to run Llama-3.2-1B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory <converted_model_path>
:: to run Llama-3.2-3B-Instruct
-python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
+python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory <converted_model_path>
:: to run Qwen2.5-7B-Instruct
-python qwen.py
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory <converted_model_path>
:: to run Qwen2-1.5B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory <converted_model_path>
:: to run Qwen2.5-3B-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory <converted_model_path>
:: to run Baichuan2-7B-Chat
-python baichuan2.py
+python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory <converted_model_path>
:: to run MiniCPM-1B-sft-bf16
-python minicpm.py
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory <converted_model_path>
:: to run MiniCPM-2B-sft-bf16
-python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16"
+python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory <converted_model_path>
```

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
-- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
- `--disable-streaming`: Disable streaming mode of generation.
+- `--save-directory SAVE_DIRECTORY`: argument defining the path to save the converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded and the converted lowbit model will be saved into `SAVE_DIRECTORY`; otherwise, the lowbit model in `SAVE_DIRECTORY` will be loaded.
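
For example, with a hypothetical local folder `converted_llama2` as the save directory, the first run of a command converts the model and saves it into that folder, and running the same command again reloads the converted model instead of converting it again:

```cmd
:: first run: converts Llama-2-7b-chat-hf and saves the converted model into converted_llama2
python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory converted_llama2
:: later runs: the converted model is loaded from converted_llama2
python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory converted_llama2
```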

### Sample Output of Streaming Mode
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)

@@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -77,10 +77,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
-trust_remote_code=True)
+trust_remote_code=True,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -92,9 +93,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:

@@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -76,10 +76,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
-transpose_value_cache=not args.disable_transpose_value_cache)
+transpose_value_cache=not args.disable_transpose_value_cache,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -90,9 +91,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:

@@ -55,12 +55,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -69,11 +63,17 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
@@ -82,10 +82,11 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
-transpose_value_cache=not args.disable_transpose_value_cache)
+transpose_value_cache=not args.disable_transpose_value_cache,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -96,9 +97,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:

@@ -36,12 +36,6 @@
help="The huggingface repo id for the MiniCPM model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -50,11 +44,17 @@
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

-if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -64,10 +64,11 @@
attn_implementation="eager",
quantization_group_size=args.quantization_group_size,
transpose_value_cache=not args.disable_transpose_value_cache,
-trust_remote_code=True)
+trust_remote_code=True,
+save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
-args.lowbit_path,
+args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -79,9 +80,6 @@

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-if args.lowbit_path and not os.path.exists(args.lowbit_path):
-model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:
