From 41fe1e505bd5de32c61980182f2916c3e2a3de00 Mon Sep 17 00:00:00 2001
From: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Date: Thu, 17 Oct 2024 18:57:13 -0700
Subject: [PATCH] [PyTorch] Reorganize L1 tests (#1255)

* Reorganize PyTorch L1 tests

Signed-off-by: Tim Moon

* Move ONNX tests to L1

Signed-off-by: Tim Moon

* Move FA version test to L3

Signed-off-by: Tim Moon

* Limit parallel build jobs in FA version test

Signed-off-by: Tim Moon

---------

Signed-off-by: Tim Moon
---
 qa/L0_pytorch_unittest/test.sh              |  7 -------
 qa/L1_pytorch_context_parallel_test/test.sh | 10 ----------
 qa/L1_pytorch_distributed_unittest/test.sh  | 12 +++++-------
 qa/L1_pytorch_onnx_test/test.sh             | 16 ++++++++++++++++
 .../test.sh                                 | 10 ++++++++++
 qa/L3_pytorch_convergence_test/test.sh      | 14 ++++++++++++++
 .../test_fusible_ops.py}                    |  0
 7 files changed, 45 insertions(+), 24 deletions(-)
 delete mode 100644 qa/L1_pytorch_context_parallel_test/test.sh
 create mode 100644 qa/L1_pytorch_onnx_test/test.sh
 rename qa/{L1_pytorch_FA_versions_test => L3_pytorch_FA_versions_test}/test.sh (83%)
 create mode 100644 qa/L3_pytorch_convergence_test/test.sh
 rename tests/pytorch/{test_fusible_ops_distributed.py => distributed/test_fusible_ops.py} (100%)

diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index bf2581217d..17307574a9 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -21,11 +21,4 @@ pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py
 pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py
 pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py
 pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py
-pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops_distributed.py
 pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py
-
-# Build custom ONNX extensions for ONNX export test
-pip install onnxruntime==1.19.2
-export CUSTOM_ORT_OPS_PATH=$TE_PATH/tests/pytorch/custom_ort_ops
-bash $CUSTOM_ORT_OPS_PATH/build.sh
-NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
diff --git a/qa/L1_pytorch_context_parallel_test/test.sh b/qa/L1_pytorch_context_parallel_test/test.sh
deleted file mode 100644
index 81ab8ee20b..0000000000
--- a/qa/L1_pytorch_context_parallel_test/test.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-set -e
-
-: ${TE_PATH:=/opt/transformerengine}
-
-pip install pytest==7.2.0
-pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index a18d06a131..c22ba221be 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -5,11 +5,9 @@
 set -e

 : ${TE_PATH:=/opt/transformerengine}

-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
-pip install prettytable
-git clone https://github.com/NVIDIA/Megatron-LM.git
-cd Megatron-LM
-git checkout b3375a0e38c10e2300ef4be031f7dcabab52b448
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
-python $TE_PATH/tests/pytorch/distributed/print_logs.py
+pip install pytest==8.2.1
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py
+pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
diff --git a/qa/L1_pytorch_onnx_test/test.sh b/qa/L1_pytorch_onnx_test/test.sh
new file mode 100644
index 0000000000..5a01468064
--- /dev/null
+++ b/qa/L1_pytorch_onnx_test/test.sh
@@ -0,0 +1,16 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+: ${TE_PATH:=/opt/transformerengine}
+
+pip install pytest==8.2.1 onnxruntime==1.19.2
+
+# Build custom ONNX Runtime operators
+export CUSTOM_ORT_OPS_PATH=$TE_PATH/tests/pytorch/custom_ort_ops
+bash $CUSTOM_ORT_OPS_PATH/build.sh
+
+# Run tests
+NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
diff --git a/qa/L1_pytorch_FA_versions_test/test.sh b/qa/L3_pytorch_FA_versions_test/test.sh
similarity index 83%
rename from qa/L1_pytorch_FA_versions_test/test.sh
rename to qa/L3_pytorch_FA_versions_test/test.sh
index 3616dd01d0..162ed85823 100644
--- a/qa/L1_pytorch_FA_versions_test/test.sh
+++ b/qa/L3_pytorch_FA_versions_test/test.sh
@@ -7,9 +7,16 @@ set -e
 : ${TE_PATH:=/opt/transformerengine}

 pip install pytest==8.2.1
+
+# Limit parallel build jobs to avoid overwhelming system resources
+export MAX_JOBS=4
+
+# Iterate over Flash Attention versions
 FA_versions=(2.1.1 2.3.0 2.4.0.post1 2.4.1 2.5.7 2.6.3 3.0.0b1)
 for fa_version in "${FA_versions[@]}"
 do
+
+  # Build Flash Attention
   if [ "${fa_version}" \< "3.0.0" ]
   then
     pip install flash-attn==${fa_version}
@@ -19,5 +26,8 @@ do
     mkdir -p $python_path/flashattn_hopper
     wget -P $python_path/flashattn_hopper https://raw.githubusercontent.com/Dao-AILab/flash-attention/main/hopper/flash_attn_interface.py
   fi
+
+  # Run tests
   NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
+
 done
diff --git a/qa/L3_pytorch_convergence_test/test.sh b/qa/L3_pytorch_convergence_test/test.sh
new file mode 100644
index 0000000000..fca621f279
--- /dev/null
+++ b/qa/L3_pytorch_convergence_test/test.sh
@@ -0,0 +1,14 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+: ${TE_PATH:=/opt/transformerengine}
+
+pip install prettytable
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+git checkout b3375a0e38c10e2300ef4be031f7dcabab52b448
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
+python $TE_PATH/tests/pytorch/distributed/print_logs.py
diff --git a/tests/pytorch/test_fusible_ops_distributed.py b/tests/pytorch/distributed/test_fusible_ops.py
similarity index 100%
rename from tests/pytorch/test_fusible_ops_distributed.py
rename to tests/pytorch/distributed/test_fusible_ops.py
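
For reference, a minimal sketch of how one of the relocated QA scripts might be run locally. The TE_PATH default and the script path come from the patch above; that Transformer Engine and a CUDA-capable GPU are already set up in the environment is an assumption, not something the patch specifies.

# Hypothetical local invocation, assuming Transformer Engine is installed
# and the repository is checked out at the scripts' default location.
export TE_PATH=/opt/transformerengine
bash $TE_PATH/qa/L1_pytorch_onnx_test/test.sh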