From 843a22ee73d5d31925233c0b06f4f6c29945ce5e Mon Sep 17 00:00:00 2001 From: James Shen Date: Thu, 3 Oct 2024 21:56:01 -0700 Subject: [PATCH] ADLR/megatron-lm!2180 - rotary_scaling fix for llama3.1 and 3.2 --- .../export/ptq_and_trtllm_export/README.md | 48 ++++++++++++++----- .../ptq_trtllm_llama2_7b.sh | 6 +-- .../ptq_trtllm_llama3_1_8b.sh | 7 +-- .../ptq_trtllm_llama3_8b.sh | 6 +-- .../ptq_trtllm_minitron_8b.sh | 2 +- .../ptq_trtllm_mistral_12b.sh | 2 +- .../text_generation_ptq.py | 7 +-- megatron/core/models/gpt/gpt_model.py | 3 +- megatron/inference/gpt/model_provider.py | 1 + 9 files changed, 50 insertions(+), 32 deletions(-) diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md index e167b60e1c..c5255f7ccf 100644 --- a/examples/export/ptq_and_trtllm_export/README.md +++ b/examples/export/ptq_and_trtllm_export/README.md @@ -74,7 +74,7 @@ cd ../.. Now launch the PTQ + TensorRT-LLM export script, ```sh -bash examples/inference/quantization/ptq_trtllm_minitron_8b ./Minitron-8B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b ./Minitron-8B-Base None ``` By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can @@ -104,12 +104,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base ``` ### mistral-12B FP8 Quantization and TensorRT-LLM Deployment @@ -139,7 +139,7 @@ huggingface-cli login Now launch the PTQ + TensorRT-LLM checkpoint export script, ```sh -bash examples/inference/quantization/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -149,12 +149,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 ``` @@ -165,7 +165,7 @@ python examples/inference/quantization/trtllm_text_generation.py --tokenizer mis > that we support. ```sh -bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} ``` The script expect `${CHECKPOINT_DIR}` to have the following structure: @@ -184,8 +184,23 @@ The script expect `${CHECKPOINT_DIR}` to have the following structure: In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as the source of the tokenizer. 
+Then build TensorRT engine and run text generation example using the newly built TensorRT engine + +```sh +export trtllm_options=" \ + --checkpoint_dir /tmp/trtllm_ckpt \ + --output_dir /tmp/trtllm_engine \ + --max_input_len 2048 \ + --max_seq_len 512 \ + --max_batch_size 8 " + +trtllm-build ${trtllm_options} + +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Llama-2-7b +``` + ### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment -> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.17 and trtllm-0.12. +> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.19 and trtllm-0.13. > **NOTE:** There are two ways to acquire the checkpoint. Users can follow > the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and @@ -199,16 +214,23 @@ If users choose to download the model from NGC, first extract the sharded checkp tar -xvf 8b_pre_trained_bf16.nemo ``` +> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to meta-llama/Llama-3.1-8B or meta-llama/Llama-3-8B on huggingface + +```sh +pip install -U "huggingface_hub[cli]" +huggingface-cli login +``` + Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3, ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None ``` or llama-3.1 ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -218,14 +240,14 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B # For llama-3 -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B #For llama-3.1 ``` \ No newline at end of file diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh index 8c4777f07a..ebcc448955 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh @@ -66,7 +66,7 @@ options=" \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} + --load ${CHECKPOINT_LOAD_DIR} \ --fp16" # Precompile CUDA extentions @@ -76,7 +76,5 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} -# This script is using 
mpi4py which will fork multiple processes. -python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh index d22ae4d472..a6251663f7 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh @@ -63,9 +63,10 @@ options=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model meta-llama/Meta-Llama-3.1-8B \ --save-interval 1000000 \ + --use-rope-scaling \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +76,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh index 11ab023fad..f181c8c2dd 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh @@ -64,8 +64,8 @@ options=" \ --tokenizer-model meta-llama/Meta-Llama-3-8B \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +75,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh index 8c7bc0cb82..31ec192fd5 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh @@ -71,4 +71,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh index 17ded50d1e..3eb02d2e1d 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh @@ -72,4 +72,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} 
examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py index 13b327b25a..340c9c90f7 100644 --- a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py +++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py @@ -6,12 +6,11 @@ import sys from pathlib import Path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) import modelopt.torch.quantization as mtq import torch from datasets import load_dataset -from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group from tqdm import tqdm # [ModelOpt]: changing the default model provider to the ModelOpt version @@ -179,10 +178,6 @@ def hf_dataset_forword_loop_func(model): if args.calib_dataset is not None: ptq_forward_loop_func = hf_dataset_forword_loop_func - # Setting data parallel and tensor parallel group - set_data_parallel_group(mpu.get_data_parallel_group()) - set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) - if args.export_quant_cfg in QUANT_CFG_CHOICES: mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] if "*output_layer*" not in mtq_config["quant_cfg"]: diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 7ee6dde182..bd52f89680 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -91,10 +91,11 @@ def __init__( # TODO: remove this dependency ? self.model_type = ModelType.encoder_or_decoder - # These 2 attributes are needed for TensorRT-LLM export. + # These 4 attributes are needed for TensorRT-LLM export. self.max_position_embeddings = max_sequence_length self.rotary_percent = rotary_percent self.rotary_base = rotary_base + self.rotary_scaling = rope_scaling if self.pre_process: self.embedding = LanguageModelEmbedding( diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index 2e92a96e9e..0df0168fa5 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -64,6 +64,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> "position_embedding_type": args.position_embedding_type, "rotary_percent": args.rotary_percent, "rotary_base": args.rotary_base, + "rope_scaling": args.use_rope_scaling, } model = model_type(**model_kwargs)
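Note (illustrative, not part of the patch): with this change, `--use-rope-scaling` is forwarded as the boolean `rope_scaling` argument of `GPTModel` and stored as `self.rotary_scaling`, so the TensorRT-LLM export path can emit the Llama 3.1/3.2 rope-scaling configuration alongside `rotary_base`. The sketch below shows the Llama 3.1-style frequency remapping that this flag corresponds to, using the published reference constants (factor 8, low/high frequency factors 1 and 4, original context 8192). The function name and the standalone implementation here are hypothetical; the real logic lives in Megatron's rotary-embedding code and in TensorRT-LLM after export.

```python
# Minimal sketch (assumption: published Llama 3.1 reference constants), not the
# code added by this patch. It shows how the rope-scaling rule enabled by
# --use-rope-scaling remaps RoPE inverse frequencies for long-context use.
import math


def apply_llama31_rope_scaling(
    inv_freq,
    factor=8.0,
    low_freq_factor=1.0,
    high_freq_factor=4.0,
    original_max_position_embeddings=8192,
):
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    scaled = []
    for freq in inv_freq:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            scaled.append(freq)            # high-frequency bands stay as-is
        elif wavelen > low_freq_wavelen:
            scaled.append(freq / factor)   # low-frequency bands are stretched
        else:
            # smooth interpolation between the two regimes
            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            scaled.append((1 - smooth) * freq / factor + smooth * freq)
    return scaled


# Example with the rotary base used by the llama3 scripts above (500000) and a
# 128-dim rotary subspace; only the lower-frequency entries end up rescaled.
base, dim = 500000.0, 128
inv_freq = [base ** (-2 * i / dim) for i in range(dim // 2)]
print(apply_llama31_rope_scaling(inv_freq)[:4])
```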