From 101ec840759e17d9208539f31f9a9b5c107c3d07 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 5 Dec 2024 00:00:45 +0000
Subject: [PATCH] Debug Mcore integration test

Avoid FP8 on Ampere and older. Generate synthetic data instead of
depending on external data.

Signed-off-by: Tim Moon
---
 qa/L1_pytorch_mcore_integration/.gitignore | 2 ++
 qa/L1_pytorch_mcore_integration/merges.txt | 1 +
 qa/L1_pytorch_mcore_integration/test.sh    | 22 ++++++++++++++++++----
 3 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 qa/L1_pytorch_mcore_integration/.gitignore
 create mode 100644 qa/L1_pytorch_mcore_integration/merges.txt

diff --git a/qa/L1_pytorch_mcore_integration/.gitignore b/qa/L1_pytorch_mcore_integration/.gitignore
new file mode 100644
index 0000000000..46426003ca
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/.gitignore
@@ -0,0 +1,2 @@
+Megatron-LM
+vocab.json
\ No newline at end of file
diff --git a/qa/L1_pytorch_mcore_integration/merges.txt b/qa/L1_pytorch_mcore_integration/merges.txt
new file mode 100644
index 0000000000..5e7f1fd949
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/merges.txt
@@ -0,0 +1 @@
+#version: 0.2
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index 01c9e14eb1..b0aba17ef5 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -8,6 +8,12 @@ set -e
 : ${TE_PATH:=/opt/transformerengine}
 : ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
 
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
 # Download Megatron-LM if needed
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
@@ -15,6 +21,14 @@ if [ ! -d "${MCORE_PATH}" ]; then
     popd
 fi
 
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
 # Megatron-LM invocation
 COMMAND="
 NVTE_TORCH_COMPILE=0
@@ -40,17 +54,17 @@ ${MCORE_PATH}/pretrain_gpt.py
 --hidden-size 128
 --num-attention-heads 8
 --seq-length 128
---max-position-embeddings 2048
+--max-position-embeddings 128
 --micro-batch-size 1
 --global-batch-size 8
 --train-iters 10
 --eval-iters 10
 --lr 1e-4
 --mock-data
---vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
---merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
 --transformer-impl transformer_engine
---fp8-format hybrid
+${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')