[PyTorch] Reorganize L1 tests (#1255)
* Reorganize PyTorch L1 tests

Signed-off-by: Tim Moon <[email protected]>

* Move ONNX tests to L1

Signed-off-by: Tim Moon <[email protected]>

* Move FA version test to L3

Signed-off-by: Tim Moon <[email protected]>

* Limit parallel build jobs in FA version test

Signed-off-by: Tim Moon <[email protected]>

---------

Signed-off-by: Tim Moon <[email protected]>
timmoon10 authored Oct 18, 2024
1 parent a488b8b commit 41fe1e5
Showing 7 changed files with 45 additions and 24 deletions.
7 changes: 0 additions & 7 deletions qa/L0_pytorch_unittest/test.sh
@@ -21,11 +21,4 @@ pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py
pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py
pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py
pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py
-pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops_distributed.py
pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py
-
-# Build custom ONNX extensions for ONNX export test
-pip install onnxruntime==1.19.2
-export CUSTOM_ORT_OPS_PATH=$TE_PATH/tests/pytorch/custom_ort_ops
-bash $CUSTOM_ORT_OPS_PATH/build.sh
-NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
10 changes: 0 additions & 10 deletions qa/L1_pytorch_context_parallel_test/test.sh

This file was deleted.

12 changes: 5 additions & 7 deletions qa/L1_pytorch_distributed_unittest/test.sh
@@ -5,11 +5,9 @@
set -e

: ${TE_PATH:=/opt/transformerengine}
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py

-pip install prettytable
-git clone https://github.com/NVIDIA/Megatron-LM.git
-cd Megatron-LM
-git checkout b3375a0e38c10e2300ef4be031f7dcabab52b448
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
-python $TE_PATH/tests/pytorch/distributed/print_logs.py
+pip install pytest==8.2.1
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py
+pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
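
As an aside, a minimal sketch of how this reorganized L1 suite might be run against a local checkout; /path/to/TransformerEngine is a placeholder, and TE_PATH falls back to /opt/transformerengine when unset, as in the script itself.

# Hypothetical local invocation (path is a placeholder)
export TE_PATH=/path/to/TransformerEngine
bash $TE_PATH/qa/L1_pytorch_distributed_unittest/test.sh
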
16 changes: 16 additions & 0 deletions qa/L1_pytorch_onnx_test/test.sh
@@ -0,0 +1,16 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

set -e

: ${TE_PATH:=/opt/transformerengine}

pip install pytest==8.2.1 onnxruntime==1.19.2

# Build custom ONNX Runtime operators
export CUSTOM_ORT_OPS_PATH=$TE_PATH/tests/pytorch/custom_ort_ops
bash $CUSTOM_ORT_OPS_PATH/build.sh

# Run tests
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
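
When iterating locally on the export test, the custom-op build need not be repeated each run; a rough sketch, assuming the test only needs CUSTOM_ORT_OPS_PATH to point at a directory that already contains the built library.

# Sketch: reuse a previously built custom ONNX Runtime op library (path is a placeholder)
export CUSTOM_ORT_OPS_PATH=/path/to/prebuilt/custom_ort_ops
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
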
Flash Attention versions test script (moved from L1 to L3)
@@ -7,9 +7,16 @@ set -e
: ${TE_PATH:=/opt/transformerengine}

pip install pytest==8.2.1
+
+# Limit parallel build jobs to avoid overwhelming system resources
+export MAX_JOBS=4
+
+# Iterate over Flash Attention versions
FA_versions=(2.1.1 2.3.0 2.4.0.post1 2.4.1 2.5.7 2.6.3 3.0.0b1)
for fa_version in "${FA_versions[@]}"
do
+
+# Build Flash Attention
if [ "${fa_version}" \< "3.0.0" ]
then
pip install flash-attn==${fa_version}
@@ -19,5 +26,8 @@ do
mkdir -p $python_path/flashattn_hopper
wget -P $python_path/flashattn_hopper https://raw.githubusercontent.com/Dao-AILab/flash-attention/main/hopper/flash_attn_interface.py
fi
+
+# Run tests
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
+
done
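
For context on the MAX_JOBS export: installing flash-attn from pip generally triggers a source build, and the PyTorch C++-extension machinery it uses reads MAX_JOBS to cap parallel compile jobs, which is what the added export relies on. A rough sketch of a single loop iteration run by hand, with the 2.6.3 pin taken from the version list above purely as an example.

# Rough single-iteration equivalent for a pre-3.0 version (illustrative; not part of the script)
export MAX_JOBS=4        # cap parallel compilation so the source build does not exhaust CPU/RAM
pip install flash-attn==2.6.3
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
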
14 changes: 14 additions & 0 deletions qa/L3_pytorch_convergence_test/test.sh
@@ -0,0 +1,14 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

set -e

: ${TE_PATH:=/opt/transformerengine}

pip install prettytable
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout b3375a0e38c10e2300ef4be031f7dcabab52b448
pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
python $TE_PATH/tests/pytorch/distributed/print_logs.py
