Commit 5c1f189: Merge branch 'main' into fsdp2

youngeunkwon0405 authored Dec 16, 2024
2 parents e4cf960 + 1975ace

Showing 54 changed files with 3,196 additions and 3,814 deletions.

.github/workflows/deploy_nightly_docs.yml (2 additions, 1 deletion)
```diff
@@ -16,13 +16,14 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download artifact
-        uses: actions/download-artifact@v4.1.7
+        uses: actions/download-artifact@v4
        with:
          name: "te_docs"
          path: "html"
      - name: Prepare for pages
        uses: actions/[email protected]
        with:
+          name: github-pages
          path: "html"
  deploy:
    needs: prepare
```

.github/workflows/docs.yml (1 addition, 1 deletion)
```diff
@@ -27,7 +27,7 @@ jobs:
          cd docs
          make html
      - name: 'Upload docs'
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
          name: te_docs
          path: docs/_build/html
```
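
These two workflow edits keep the artifact actions on compatible major versions: the `te_docs` artifact uploaded here is the one `deploy_nightly_docs.yml` downloads, and artifacts produced by `upload-artifact@v4` cannot be read by v3 of the download action. For local inspection, the same artifact can also be fetched with the GitHub CLI; a minimal sketch, where the run ID is a hypothetical placeholder:

```bash
# List recent runs of the docs workflow, then pull the "te_docs"
# artifact of a chosen run into ./html for inspection.
gh run list --workflow docs.yml --limit 5
gh run download 1234567890 --name te_docs --dir html
```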

.github/workflows/trigger-ci.yml (1 addition)
```diff
@@ -42,6 +42,7 @@ jobs:
         || github.actor == 'kocchop'
         || github.actor == 'youngeunkwon0405'
         || github.actor == 'KshitijLakhani'
+        || github.actor == 'jberchtold-nvidia'
       )
     steps:
       - name: Check if comment is issued by authorized person
```

examples/pytorch/comm_gemm_overlap/README.md (2 additions, 2 deletions)
````diff
@@ -16,7 +16,7 @@
 Forward and backward passes with layer weights distributed over all GPUs in a single node.
 
 ```bash
-$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) ln_mlp_with_overlap.py
+$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_overlap.py
 
 # Sample output on 8x H100s:
 # [rank0:node0] |-- Created tensor-parallel group: [0, 1, 2, 3, 4, 5, 6, 7]
@@ -70,7 +70,7 @@ Uses `torch.nn.parallel.DistributedDataParallel` for replicating the model across
 groups in a single node.
 
 ```bash
-$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) ln_mlp_overlap.py --num-replicas 2
+$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_overlap.py --num-replicas 2
 
 # Sample output on 8x H100s:
 # [rank0:node0] |-- Created tensor-parallel group: [0, 1, 2, 3]
````
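
Both commands derive the per-node process count from `nvidia-smi`. A minimal sketch of that idiom (the `echo` line is illustrative only):

```bash
# `nvidia-smi -L` prints one line per visible GPU, so counting lines
# gives the number of processes torchrun should launch on this node.
NUM_GPUS=$(nvidia-smi -L | wc -l)
echo "Launching ${NUM_GPUS} ranks on $(hostname)"
torchrun --nnodes=1 --nproc-per-node=${NUM_GPUS} te_layer_with_overlap.py
```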

qa/L1_pytorch_mcore_integration/.gitignore (2 additions)
```diff
@@ -0,0 +1,2 @@
+Megatron-LM
+vocab.json
```

qa/L1_pytorch_mcore_integration/merges.txt (1 addition)
```diff
@@ -0,0 +1 @@
+#version: 0.2
```

qa/L1_pytorch_mcore_integration/test.sh (18 additions, 4 deletions)
```diff
@@ -8,13 +8,27 @@ set -e
 : ${TE_PATH:=/opt/transformerengine}
 : ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
 
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
 # Download Megatron-LM if needed
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
     git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
     popd
 fi
 
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
 # Megatron-LM invocation
 COMMAND="
 NVTE_TORCH_COMPILE=0
@@ -40,17 +54,17 @@ ${MCORE_PATH}/pretrain_gpt.py
 --hidden-size 128
 --num-attention-heads 8
 --seq-length 128
---max-position-embeddings 2048
+--max-position-embeddings 128
 --micro-batch-size 1
 --global-batch-size 8
 --train-iters 10
 --eval-iters 10
 --lr 1e-4
 --mock-data
---vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
---merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
 --transformer-impl transformer_engine
---fp8-format hybrid
+${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
```
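The mock vocab written above is a single-line JSON object mapping `<|endoftext|>` to id 0 and the strings `"1"` through `"4095"` to themselves; together with the one-line `merges.txt` header added earlier, that is just enough for Megatron-LM's GPT-2 tokenizer to initialize under `--mock-data`. An equivalent one-pass version of the generator, shown only for illustration and writing to `./vocab.json`:

```bash
# Build the same 4096-entry vocab in a single awk invocation.
seq 1 4095 | awk '
    BEGIN { printf("{\"<|endoftext|>\": 0") }
          { printf(", \"%d\": %d", $1, $1) }
    END   { printf("}\n") }' > vocab.json

# Sanity check: the file is valid JSON with 4096 entries.
python -c 'import json; print(len(json.load(open("vocab.json"))))'
```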

tests/cpp/operator/CMakeLists.txt (1 addition, 2 deletions)
```diff
@@ -10,8 +10,7 @@ add_executable(test_operator
   test_cast_transpose_dbias_dgelu.cu
   test_cast_transpose_dgeglu.cu
   test_act.cu
-  test_layernorm.cu
-  test_rmsnorm.cu
+  test_normalization.cu
   test_multi_cast_transpose.cu
   test_multi_padding.cu
   test_causal_softmax.cu
```

(Diffs for the remaining 46 changed files are not shown.)