[NPU] update save-load API usage #12473

Merged: 6 commits, Dec 3, 2024
python/llm/dev/benchmark/all-in-one/run.py (4 additions, 2 deletions)
@@ -634,7 +634,7 @@ def transformers_int4_npu_win(repo_id,
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
attn_implementation="eager", torch_dtype=torch.float16).eval()
save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
model = model.llm
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
@@ -702,14 +702,16 @@ def transformers_int4_npu_pipeline_win(repo_id,
in_out_len = in_out_pairs[0].split("-")
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
mixed_precision = True if npu_group_size == 0 else False
save_directory = "./save_converted_model_dir"
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
st = time.perf_counter()

model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision).eval()
use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision,
save_directory=save_directory).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

end = time.perf_counter()
README.md
@@ -47,45 +47,45 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam

```cmd
:: to run Llama-2-7b-chat-hf
- python llama2.py
+ python llama2.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory <converted_model_path>

:: to run Meta-Llama-3-8B-Instruct
- python llama3.py
+ python llama3.py --repo-id-or-model-path "meta-llama/Meta-Llama-3-8B-Instruct" --save-directory <converted_model_path>

:: to run Llama-3.2-1B-Instruct
python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --save-directory <converted_model_path>

:: to run Llama-3.2-3B-Instruct
python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory <converted_model_path>

:: to run Qwen2.5-7B-Instruct
- python qwen.py
+ python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory <converted_model_path>

:: to run Qwen2-1.5B-Instruct
python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory <converted_model_path>

:: to run Qwen2.5-3B-Instruct
python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory <converted_model_path>

:: to run Baichuan2-7B-Chat
- python baichuan2.py
+ python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory <converted_model_path>

:: to run MiniCPM-1B-sft-bf16
- python minicpm.py
+ python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-directory <converted_model_path>

:: to run MiniCPM-2B-sft-bf16
python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16"
python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory <converted_model_path>
```

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
- - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
- `--disable-streaming`: Disable streaming mode of generation.
+ - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
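
For reference, the convert-then-reload flow that `--save-directory` drives looks roughly like the sketch below. It mirrors the updated example scripts in this PR; the import path, model id, and local folder name are illustrative assumptions, and model-specific generation arguments are omitted.

```python
import os
import torch
from transformers import AutoTokenizer
# Import path as used in IPEX-LLM's NPU examples (an assumption here; adjust to your install).
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # HF repo id or local checkpoint folder
save_directory = "./llama2-npu-converted"     # hypothetical folder for the converted model

if not os.path.exists(save_directory):
    # First run: convert the model and save the low-bit version into save_directory.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 trust_remote_code=True,
                                                 save_directory=save_directory)
else:
    # Later runs: skip conversion and load the already-converted model from save_directory.
    model = AutoModelForCausalLM.load_low_bit(save_directory,
                                              attn_implementation="eager",
                                              torch_dtype=torch.float16,
                                              max_context_len=1024)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Compared with the earlier `--lowbit-path` flow, there is no separate `model.save_low_bit(...)` call: passing `save_directory` to `from_pretrained` saves the converted model as part of loading, and later runs go straight through `load_low_bit`.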

### Sample Output of Streaming Mode
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
baichuan2.py
@@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

- if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+ if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -77,10 +77,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
- trust_remote_code=True)
+ trust_remote_code=True,
+ save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
- args.lowbit_path,
+ args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -92,9 +93,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

- if args.lowbit_path and not os.path.exists(args.lowbit_path):
- model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:
llama2.py
@@ -49,12 +49,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -63,11 +57,17 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

- if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+ if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -76,10 +76,11 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
- transpose_value_cache=not args.disable_transpose_value_cache)
+ transpose_value_cache=not args.disable_transpose_value_cache,
+ save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
- args.lowbit_path,
+ args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -90,9 +91,6 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

- if args.lowbit_path and not os.path.exists(args.lowbit_path):
- model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:
llama3.py
@@ -55,12 +55,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -69,11 +63,17 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

- if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+ if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
@@ -82,10 +82,11 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
- transpose_value_cache=not args.disable_transpose_value_cache)
+ transpose_value_cache=not args.disable_transpose_value_cache,
+ save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
- args.lowbit_path,
+ args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -96,9 +97,6 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

- if args.lowbit_path and not os.path.exists(args.lowbit_path):
- model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else:
minicpm.py
@@ -36,12 +36,6 @@
help="The huggingface repo id for the MiniCPM model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -50,11 +44,17 @@
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)
parser.add_argument("--save-directory", type=str,
required=True,
help="The path of folder to save converted model, "
"If path not exists, lowbit model will be saved there. "
"Else, lowbit model will be loaded.",
)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

- if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+ if not os.path.exists(args.save_directory):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -64,10 +64,11 @@
attn_implementation="eager",
quantization_group_size=args.quantization_group_size,
transpose_value_cache=not args.disable_transpose_value_cache,
- trust_remote_code=True)
+ trust_remote_code=True,
+ save_directory=args.save_directory)
else:
model = AutoModelForCausalLM.load_low_bit(
- args.lowbit_path,
+ args.save_directory,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
@@ -79,9 +80,6 @@

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

- if args.lowbit_path and not os.path.exists(args.lowbit_path):
- model.save_low_bit(args.lowbit_path)

if args.disable_streaming:
streamer = None
else: