Commit
Merge branch 'yueshen/mixtral8x7b_support' into 'main'
Mixtral8x7b modelopt support

See merge request ADLR/megatron-lm!2200
jaredcasper committed Nov 6, 2024
2 parents ad5ce45 + 0197f6f commit f39c48d
Showing 5 changed files with 154 additions and 15 deletions.
42 changes: 42 additions & 0 deletions examples/export/ptq_and_trtllm_export/README.md
@@ -250,4 +250,46 @@ python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokeniz

python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B
#For llama-3.1
```


### Mixtral-8x7B FP8 Quantization and TensorRT-LLM Deployment
First download the Mixtral-8x7B NeMo checkpoint from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mixtral-8x7b-v01, then extract the
sharded checkpoint from the `.nemo` tarball.

```sh
ngc registry model download-version "nvidia/nemo/mixtral-8x7b-v01:1.0"
cd mixtral-8x7b-v01_v1.0
tar -xvf mixtral.nemo
cd ..
```
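
After extraction, the sharded checkpoint should be visible inside the download directory; a quick `ls` is an easy sanity check (the exact file layout depends on the `.nemo` archive):

```sh
# Optional sanity check: the .nemo file is a tar archive, so the extracted
# model weights and config should now sit alongside it in this directory.
ls mixtral-8x7b-v01_v1.0
```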

Then log in to Hugging Face so that you can access the model.

> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to the mistralai/Mixtral-8x7B-v0.1 model on Hugging Face.
```sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login
```
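
If you are working in a non-interactive shell, an alternative to the interactive login is to export the token as an environment variable; recent versions of `huggingface_hub` read `HF_TOKEN` (the value below is a placeholder, not a real token):

```sh
# Alternative to `huggingface-cli login` for non-interactive environments.
export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx  # replace with your own token
```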

Now launch the PTQ + TensorRT-LLM checkpoint export script:

```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh ./mixtral-8x7b-v01_v1.0/
```
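
Optionally, before building the engine, confirm that the export step wrote a TensorRT-LLM checkpoint to `/tmp/trtllm_ckpt` (the `--export-dir` used by the script above); this is only a sanity check:

```sh
# The exported TensorRT-LLM checkpoint (typically a config.json plus
# per-rank weight files) should now be under /tmp/trtllm_ckpt.
ls /tmp/trtllm_ckpt
```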

Then build the TensorRT engine and run the text generation example using the newly built engine:

```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "

trtllm-build ${trtllm_options}

python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mixtral-8x7B-v0.1
```
84 changes: 84 additions & 0 deletions examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
@@ -0,0 +1,84 @@
#!/bin/bash
set -e

DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
NAME="${1:-$DEFAULT_NAME}"

DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"

# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1

# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"

if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi

additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "

# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1

options=" \
--untie-embeddings-and-output-weights \
--no-masked-softmax-fusion \
--no-position-embedding \
--use-mcore-models \
--disable-bias-linear \
--rotary-percent 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 4096 \
--kv-channels 128 \
--normalization RMSNorm \
--swiglu \
--num-query-groups 8 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-aux-loss-coeff 1e-2 \
--moe-router-load-balancing-type aux_loss \
--group-query-attention \
--position-embedding-type rope \
--no-rope-fusion \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--tokenizer-type HuggingFaceTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model mistralai/Mixtral-8x7B-Instruct-v0.1 \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--bf16 \
--rotary-base 1000000 \
--use-dist-ckpt"

# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"

# Build the launch configuration; launch_config is passed to torchrun below
launch_config="--nproc_per_node=${TP}"

# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}


8 changes: 6 additions & 2 deletions examples/export/ptq_and_trtllm_export/text_generation_ptq.py
@@ -6,7 +6,7 @@
import sys
from pathlib import Path

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../")))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))

import modelopt.torch.quantization as mtq
import torch
@@ -120,6 +120,9 @@ def get_calib_dataloader(

print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
args.exit_on_missing_checkpoint = True
if hasattr(args, 'moe_grouped_gemm') and args.moe_grouped_gemm:
print_rank_0("WARNING: Forcing moe_grouped_gemm to False for PTQ and export.")
args.moe_grouped_gemm = False

# Set up model and load checkpoint
# [ModelOpt]: make sure that output logits are allgathered.
@@ -168,7 +171,7 @@ def hf_dataset_forword_loop_func(model):
model,
prompts=prompts,
tokens_to_generate=0,
return_output_log_probs=True,
return_output_log_probs=False,
temperature=1.0,
)
else:
@@ -216,3 +219,4 @@ def hf_dataset_forword_loop_func(model):
)

print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
torch.distributed.barrier()
30 changes: 18 additions & 12 deletions megatron/core/inference/modelopt_support/gpt/model_specs.py
@@ -2,31 +2,42 @@

from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules


# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec(
remap_te_layernorm: bool = False, qk_layernorm: bool = False
num_experts: int = None,
moe_grouped_gemm: bool = False,
remap_te_layernorm: bool = False,
qk_layernorm: bool = False,
) -> ModuleSpec:
"""Mix the native spec with TENorm.
This is essentially the native local spec, except that the layernorm implementation
uses TENorm from Transformer-Engine. The reason is that FusedLayerNorm from apex
has stopped supporting the RMSNorm needed by llama.
"""
mlp = _get_mlp_module_spec(
use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=False
)
sharded_state_dict_keys_map = {}
if remap_te_layernorm:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
if num_experts:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_'
}
else:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
@@ -44,12 +55,7 @@ def get_gpt_layer_modelopt_spec(
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear
),
),
mlp=mlp,
mlp_bda=get_bias_dropout_add,
# Map TE-layernorm-fusion keys back
sharded_state_dict_keys_map=sharded_state_dict_keys_map,
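
For reference, a minimal usage sketch of the updated `get_gpt_layer_modelopt_spec` signature (the argument values are illustrative; the import path follows the file location above):

```python
# Illustrative only: build the ModelOpt GPT layer spec for an 8-expert MoE
# model, remapping TE layernorm keys and leaving grouped GEMM disabled,
# mirroring the PTQ/export settings used elsewhere in this commit.
from megatron.core.inference.modelopt_support.gpt.model_specs import (
    get_gpt_layer_modelopt_spec,
)

layer_spec = get_gpt_layer_modelopt_spec(
    num_experts=8,
    moe_grouped_gemm=False,
    remap_te_layernorm=True,
    qk_layernorm=False,
)
```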
5 changes: 4 additions & 1 deletion megatron/inference/gpt/model_provider.py
@@ -150,7 +150,10 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) ->
transformer_layer_spec = import_module(args.spec)
else:
transformer_layer_spec = get_gpt_layer_modelopt_spec(
remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False
num_experts=args.num_experts,
moe_grouped_gemm=args.moe_grouped_gemm,
remap_te_layernorm=args.export_te_mcore_model,
qk_layernorm=False,
)

model_kwargs = {
