Commit
Merge branch 'yueshen/mixtral8x7b_support' into 'main'
Mixtral8x7b modelopt support

See merge request ADLR/megatron-lm!2200
jaredcasper committed Nov 6, 2024
2 parents ad5ce45 + 0197f6f commit f39c48d
Showing 5 changed files with 154 additions and 15 deletions.
42 changes: 42 additions & 0 deletions examples/export/ptq_and_trtllm_export/README.md
@@ -250,4 +250,46 @@ python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokeniz

python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B
#For llama-3.1
```


### Mixtral-8x7B FP8 Quantization and TensorRT-LLM Deployment
First download the Mixtral-8x7B NeMo checkpoint from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mixtral-8x7b-v01, then extract the
sharded checkpoint from the `.nemo` tarball.

```sh
ngc registry model download-version "nvidia/nemo/mixtral-8x7b-v01:1.0"
cd mixtral-8x7b-v01_v1.0
tar -xvf mixtral.nemo
cd ..
```
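
After extraction, the sharded checkpoint should be visible inside the download directory; a quick `ls` is an easy sanity check (the exact file layout depends on the `.nemo` archive):

```sh
# Optional sanity check: the .nemo file is a tar archive, so the extracted
# model weights and config should now sit alongside it in this directory.
ls mixtral-8x7b-v01_v1.0
```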

Then log in to Hugging Face so that you can access the model.

> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to the mistralai/Mixtral-8x7B-v0.1 model on Hugging Face.
```sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login
```
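
If you are working in a non-interactive shell, an alternative to the interactive login is to export the token as an environment variable; recent versions of `huggingface_hub` read `HF_TOKEN` (the value below is a placeholder, not a real token):

```sh
# Alternative to `huggingface-cli login` for non-interactive environments.
export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx  # replace with your own token
```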

Now launch the PTQ + TensorRT-LLM checkpoint export script:

```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh ./mixtral-8x7b-v01_v1.0/
```
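
Optionally, before building the engine, confirm that the export step wrote a TensorRT-LLM checkpoint to `/tmp/trtllm_ckpt` (the `--export-dir` used by the script above); this is only a sanity check:

```sh
# The exported TensorRT-LLM checkpoint (typically a config.json plus
# per-rank weight files) should now be under /tmp/trtllm_ckpt.
ls /tmp/trtllm_ckpt
```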

Then build the TensorRT engine and run the text generation example using the newly built engine:

```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "

trtllm-build ${trtllm_options}

python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mixtral-8x7B-v0.1
```
84 changes: 84 additions & 0 deletions examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
@@ -0,0 +1,84 @@
#!/bin/bash
set -e

DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
NAME="${1:-$DEFAULT_NAME}"

DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"

# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1

# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"

if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi

additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "

# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1

options=" \
--untie-embeddings-and-output-weights \
--no-masked-softmax-fusion \
--no-position-embedding \
--use-mcore-models \
--disable-bias-linear \
--rotary-percent 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 4096 \
--kv-channels 128 \
--normalization RMSNorm \
--swiglu \
--num-query-groups 8 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-aux-loss-coeff 1e-2 \
--moe-router-load-balancing-type aux_loss \
--group-query-attention \
--position-embedding-type rope \
--no-rope-fusion \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--tokenizer-type HuggingFaceTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model mistralai/Mixtral-8x7B-Instruct-v0.1 \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--bf16 \
--rotary-base 1000000 \
--use-dist-ckpt"

# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"

# Build the launch configuration; launch_config is passed to torchrun below
launch_config="--nproc_per_node=${TP}"

# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}


8 changes: 6 additions & 2 deletions examples/export/ptq_and_trtllm_export/text_generation_ptq.py
@@ -6,7 +6,7 @@
import sys
from pathlib import Path

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../")))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))

import modelopt.torch.quantization as mtq
import torch
@@ -120,6 +120,9 @@ def get_calib_dataloader(

print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
args.exit_on_missing_checkpoint = True
if hasattr(args, 'moe_grouped_gemm') and args.moe_grouped_gemm:
print_rank_0("WARNING: Forcing moe_grouped_gemm to False for PTQ and export.")
args.moe_grouped_gemm = False

# Set up model and load checkpoint
# [ModelOpt]: make sure that output logits are allgathered.
@@ -168,7 +171,7 @@ def hf_dataset_forword_loop_func(model):
model,
prompts=prompts,
tokens_to_generate=0,
return_output_log_probs=True,
return_output_log_probs=False,
temperature=1.0,
)
else:
@@ -216,3 +219,4 @@ def hf_dataset_forword_loop_func(model):
)

print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
torch.distributed.barrier()
30 changes: 18 additions & 12 deletions megatron/core/inference/modelopt_support/gpt/model_specs.py
@@ -2,31 +2,42 @@

from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules


# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec(
remap_te_layernorm: bool = False, qk_layernorm: bool = False
num_experts: int = None,
moe_grouped_gemm: bool = False,
remap_te_layernorm: bool = False,
qk_layernorm: bool = False,
) -> ModuleSpec:
"""Mix the native spec with TENorm.
This is essentially the native local spec, except that the layernorm implementation
uses TENorm from Transformer-Engine. The reason is that FusedLayerNorm from apex
has stopped supporting the RMSNorm needed by llama.
"""
mlp = _get_mlp_module_spec(
use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=False
)
sharded_state_dict_keys_map = {}
if remap_te_layernorm:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
if num_experts:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_'
}
else:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
@@ -44,12 +55,7 @@ def get_gpt_layer_modelopt_spec(
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear
),
),
mlp=mlp,
mlp_bda=get_bias_dropout_add,
# Map TE-layernorm-fusion keys back
sharded_state_dict_keys_map=sharded_state_dict_keys_map,
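
For reference, a minimal usage sketch of the updated `get_gpt_layer_modelopt_spec` signature (the argument values are illustrative; the import path follows the file location above):

```python
# Illustrative only: build the ModelOpt GPT layer spec for an 8-expert MoE
# model, remapping TE layernorm keys and leaving grouped GEMM disabled,
# mirroring the PTQ/export settings used elsewhere in this commit.
from megatron.core.inference.modelopt_support.gpt.model_specs import (
    get_gpt_layer_modelopt_spec,
)

layer_spec = get_gpt_layer_modelopt_spec(
    num_experts=8,
    moe_grouped_gemm=False,
    remap_te_layernorm=True,
    qk_layernorm=False,
)
```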
5 changes: 4 additions & 1 deletion megatron/inference/gpt/model_provider.py
@@ -150,7 +150,10 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) ->
transformer_layer_spec = import_module(args.spec)
else:
transformer_layer_spec = get_gpt_layer_modelopt_spec(
remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False
num_experts=args.num_experts,
moe_grouped_gemm=args.moe_grouped_gemm,
remap_te_layernorm=args.export_te_mcore_model,
qk_layernorm=False,
)

model_kwargs = {
