From 843a22ee73d5d31925233c0b06f4f6c29945ce5e Mon Sep 17 00:00:00 2001 From: James Shen Date: Thu, 3 Oct 2024 21:56:01 -0700 Subject: [PATCH] ADLR/megatron-lm!2180 - rotary_scaling fix for llama3.1 and 3.2 --- .../export/ptq_and_trtllm_export/README.md | 48 ++++++++++++++----- .../ptq_trtllm_llama2_7b.sh | 6 +-- .../ptq_trtllm_llama3_1_8b.sh | 7 +-- .../ptq_trtllm_llama3_8b.sh | 6 +-- .../ptq_trtllm_minitron_8b.sh | 2 +- .../ptq_trtllm_mistral_12b.sh | 2 +- .../text_generation_ptq.py | 7 +-- megatron/core/models/gpt/gpt_model.py | 3 +- megatron/inference/gpt/model_provider.py | 1 + 9 files changed, 50 insertions(+), 32 deletions(-) diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md index e167b60e1c..c5255f7ccf 100644 --- a/examples/export/ptq_and_trtllm_export/README.md +++ b/examples/export/ptq_and_trtllm_export/README.md @@ -74,7 +74,7 @@ cd ../.. Now launch the PTQ + TensorRT-LLM export script, ```sh -bash examples/inference/quantization/ptq_trtllm_minitron_8b ./Minitron-8B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b ./Minitron-8B-Base None ``` By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can @@ -104,12 +104,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base ``` ### mistral-12B FP8 Quantization and TensorRT-LLM Deployment @@ -139,7 +139,7 @@ huggingface-cli login Now launch the PTQ + TensorRT-LLM checkpoint export script, ```sh -bash examples/inference/quantization/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -149,12 +149,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 ``` @@ -165,7 +165,7 @@ python examples/inference/quantization/trtllm_text_generation.py --tokenizer mis > that we support. ```sh -bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} ``` The script expect `${CHECKPOINT_DIR}` to have the following structure: @@ -184,8 +184,23 @@ The script expect `${CHECKPOINT_DIR}` to have the following structure: In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as the source of the tokenizer. 
+Then build TensorRT engine and run text generation example using the newly built TensorRT engine + +```sh +export trtllm_options=" \ + --checkpoint_dir /tmp/trtllm_ckpt \ + --output_dir /tmp/trtllm_engine \ + --max_input_len 2048 \ + --max_seq_len 512 \ + --max_batch_size 8 " + +trtllm-build ${trtllm_options} + +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Llama-2-7b +``` + ### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment -> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.17 and trtllm-0.12. +> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.19 and trtllm-0.13. > **NOTE:** There are two ways to acquire the checkpoint. Users can follow > the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and @@ -199,16 +214,23 @@ If users choose to download the model from NGC, first extract the sharded checkp tar -xvf 8b_pre_trained_bf16.nemo ``` +> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to meta-llama/Llama-3.1-8B or meta-llama/Llama-3-8B on huggingface + +```sh +pip install -U "huggingface_hub[cli]" +huggingface-cli login +``` + Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3, ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None ``` or llama-3.1 ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -218,14 +240,14 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B # For llama-3 -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B #For llama-3.1 ``` \ No newline at end of file diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh index 8c4777f07a..ebcc448955 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh @@ -66,7 +66,7 @@ options=" \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} + --load ${CHECKPOINT_LOAD_DIR} \ --fp16" # Precompile CUDA extentions @@ -76,7 +76,5 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} -# This script is using 
mpi4py which will fork multiple processes. -python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh index d22ae4d472..a6251663f7 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh @@ -63,9 +63,10 @@ options=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model meta-llama/Meta-Llama-3.1-8B \ --save-interval 1000000 \ + --use-rope-scaling \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +76,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh index 11ab023fad..f181c8c2dd 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh @@ -64,8 +64,8 @@ options=" \ --tokenizer-model meta-llama/Meta-Llama-3-8B \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +75,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh index 8c7bc0cb82..31ec192fd5 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh @@ -71,4 +71,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh index 17ded50d1e..3eb02d2e1d 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh @@ -72,4 +72,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} 
examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py index 13b327b25a..340c9c90f7 100644 --- a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py +++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py @@ -6,12 +6,11 @@ import sys from pathlib import Path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) import modelopt.torch.quantization as mtq import torch from datasets import load_dataset -from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group from tqdm import tqdm # [ModelOpt]: changing the default model provider to the ModelOpt version @@ -179,10 +178,6 @@ def hf_dataset_forword_loop_func(model): if args.calib_dataset is not None: ptq_forward_loop_func = hf_dataset_forword_loop_func - # Setting data parallel and tensor parallel group - set_data_parallel_group(mpu.get_data_parallel_group()) - set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) - if args.export_quant_cfg in QUANT_CFG_CHOICES: mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] if "*output_layer*" not in mtq_config["quant_cfg"]: diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 7ee6dde182..bd52f89680 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -91,10 +91,11 @@ def __init__( # TODO: remove this dependency ? self.model_type = ModelType.encoder_or_decoder - # These 2 attributes are needed for TensorRT-LLM export. + # These 4 attributes are needed for TensorRT-LLM export. self.max_position_embeddings = max_sequence_length self.rotary_percent = rotary_percent self.rotary_base = rotary_base + self.rotary_scaling = rope_scaling if self.pre_process: self.embedding = LanguageModelEmbedding( diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index 2e92a96e9e..0df0168fa5 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -64,6 +64,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> "position_embedding_type": args.position_embedding_type, "rotary_percent": args.rotary_percent, "rotary_base": args.rotary_base, + "rope_scaling": args.use_rope_scaling, } model = model_type(**model_kwargs)
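Note (illustrative, not part of the patch): with this change, `--use-rope-scaling` is forwarded as the boolean `rope_scaling` argument of `GPTModel` and stored as `self.rotary_scaling`, so the TensorRT-LLM export path can emit the Llama 3.1/3.2 rope-scaling configuration alongside `rotary_base`. The sketch below shows the Llama 3.1-style frequency remapping that this flag corresponds to, using the published reference constants (factor 8, low/high frequency factors 1 and 4, original context 8192). The function name and the standalone implementation here are hypothetical; the real logic lives in Megatron's rotary-embedding code and in TensorRT-LLM after export.

```python
# Minimal sketch (assumption: published Llama 3.1 reference constants), not the
# code added by this patch. It shows how the rope-scaling rule enabled by
# --use-rope-scaling remaps RoPE inverse frequencies for long-context use.
import math


def apply_llama31_rope_scaling(
    inv_freq,
    factor=8.0,
    low_freq_factor=1.0,
    high_freq_factor=4.0,
    original_max_position_embeddings=8192,
):
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    scaled = []
    for freq in inv_freq:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            scaled.append(freq)            # high-frequency bands stay as-is
        elif wavelen > low_freq_wavelen:
            scaled.append(freq / factor)   # low-frequency bands are stretched
        else:
            # smooth interpolation between the two regimes
            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            scaled.append((1 - smooth) * freq / factor + smooth * freq)
    return scaled


# Example with the rotary base used by the llama3 scripts above (500000) and a
# 128-dim rotary subspace; only the lower-frequency entries end up rescaled.
base, dim = 500000.0, 128
inv_freq = [base ** (-2 * i / dim) for i in range(dim // 2)]
print(apply_llama31_rope_scaling(inv_freq)[:4])
```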