From 101ec840759e17d9208539f31f9a9b5c107c3d07 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 5 Dec 2024 00:00:45 +0000
Subject: [PATCH] Debug Mcore integration test

Avoid FP8 on Ampere and older. Generate synthetic data instead of
depending on external data.

Signed-off-by: Tim Moon
---
 qa/L1_pytorch_mcore_integration/.gitignore | 2 ++
 qa/L1_pytorch_mcore_integration/merges.txt | 1 +
 qa/L1_pytorch_mcore_integration/test.sh    | 22 ++++++++++++++++++----
 3 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 qa/L1_pytorch_mcore_integration/.gitignore
 create mode 100644 qa/L1_pytorch_mcore_integration/merges.txt

diff --git a/qa/L1_pytorch_mcore_integration/.gitignore b/qa/L1_pytorch_mcore_integration/.gitignore
new file mode 100644
index 0000000000..46426003ca
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/.gitignore
@@ -0,0 +1,2 @@
+Megatron-LM
+vocab.json
\ No newline at end of file
diff --git a/qa/L1_pytorch_mcore_integration/merges.txt b/qa/L1_pytorch_mcore_integration/merges.txt
new file mode 100644
index 0000000000..5e7f1fd949
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/merges.txt
@@ -0,0 +1 @@
+#version: 0.2
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index 01c9e14eb1..b0aba17ef5 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -8,6 +8,12 @@ set -e
 : ${TE_PATH:=/opt/transformerengine}
 : ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
 
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
 # Download Megatron-LM if needed
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
@@ -15,6 +21,14 @@ if [ ! -d "${MCORE_PATH}" ]; then
     popd
 fi
 
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
 # Megatron-LM invocation
 COMMAND="
 NVTE_TORCH_COMPILE=0
@@ -40,17 +54,17 @@ ${MCORE_PATH}/pretrain_gpt.py
 --hidden-size 128
 --num-attention-heads 8
 --seq-length 128
---max-position-embeddings 2048
+--max-position-embeddings 128
 --micro-batch-size 1
 --global-batch-size 8
 --train-iters 10
 --eval-iters 10
 --lr 1e-4
 --mock-data
---vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
---merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
 --transformer-impl transformer_engine
---fp8-format hybrid
+${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')