From 41fe1e505bd5de32c61980182f2916c3e2a3de00 Mon Sep 17 00:00:00 2001
From: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Date: Thu, 17 Oct 2024 18:57:13 -0700
Subject: [PATCH] [PyTorch] Reorganize L1 tests (#1255)

* Reorganize PyTorch L1 tests

Signed-off-by: Tim Moon

* Move ONNX tests to L1

Signed-off-by: Tim Moon

* Move FA version test to L3

Signed-off-by: Tim Moon

* Limit parallel build jobs in FA version test

Signed-off-by: Tim Moon

---------

Signed-off-by: Tim Moon
---
 qa/L0_pytorch_unittest/test.sh              |  7 -------
 qa/L1_pytorch_context_parallel_test/test.sh | 10 ----------
 qa/L1_pytorch_distributed_unittest/test.sh  | 12 +++++-------
 qa/L1_pytorch_onnx_test/test.sh             | 16 ++++++++++++++++
 .../test.sh                                 | 10 ++++++++++
 qa/L3_pytorch_convergence_test/test.sh      | 14 ++++++++++++++
 .../test_fusible_ops.py}                    |  0
 7 files changed, 45 insertions(+), 24 deletions(-)
 delete mode 100644 qa/L1_pytorch_context_parallel_test/test.sh
 create mode 100644 qa/L1_pytorch_onnx_test/test.sh
 rename qa/{L1_pytorch_FA_versions_test => L3_pytorch_FA_versions_test}/test.sh (83%)
 create mode 100644 qa/L3_pytorch_convergence_test/test.sh
 rename tests/pytorch/{test_fusible_ops_distributed.py => distributed/test_fusible_ops.py} (100%)

diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index bf2581217d..17307574a9 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -21,11 +21,4 @@ pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py
 pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py
 pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py
 pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py
-pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops_distributed.py
 pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py
-
-# Build custom ONNX extensions for ONNX export test
-pip install onnxruntime==1.19.2
-export CUSTOM_ORT_OPS_PATH=$TE_PATH/tests/pytorch/custom_ort_ops
-bash $CUSTOM_ORT_OPS_PATH/build.sh
-NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
diff --git a/qa/L1_pytorch_context_parallel_test/test.sh b/qa/L1_pytorch_context_parallel_test/test.sh
deleted file mode 100644
index 81ab8ee20b..0000000000
--- a/qa/L1_pytorch_context_parallel_test/test.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
-
-set -e
-
-: ${TE_PATH:=/opt/transformerengine}
-
-pip install pytest==7.2.0
-pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index a18d06a131..c22ba221be 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -5,11 +5,9 @@
 set -e

 : ${TE_PATH:=/opt/transformerengine}

-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
-pip install prettytable
-git clone https://github.com/NVIDIA/Megatron-LM.git
-cd Megatron-LM
-git checkout b3375a0e38c10e2300ef4be031f7dcabab52b448
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
-python $TE_PATH/tests/pytorch/distributed/print_logs.py
+pip install pytest==8.2.1
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py
+pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
diff --git a/qa/L1_pytorch_onnx_test/test.sh b/qa/L1_pytorch_onnx_test/test.sh
new file mode 100644
index 0000000000..5a01468064
--- /dev/null
+++ b/qa/L1_pytorch_onnx_test/test.sh
@@ -0,0 +1,16 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+: ${TE_PATH:=/opt/transformerengine}
+
+pip install pytest==8.2.1 onnxruntime==1.19.2
+
+# Build custom ONNX Runtime operators
+export CUSTOM_ORT_OPS_PATH=$TE_PATH/tests/pytorch/custom_ort_ops
+bash $CUSTOM_ORT_OPS_PATH/build.sh
+
+# Run tests
+NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py
diff --git a/qa/L1_pytorch_FA_versions_test/test.sh b/qa/L3_pytorch_FA_versions_test/test.sh
similarity index 83%
rename from qa/L1_pytorch_FA_versions_test/test.sh
rename to qa/L3_pytorch_FA_versions_test/test.sh
index 3616dd01d0..162ed85823 100644
--- a/qa/L1_pytorch_FA_versions_test/test.sh
+++ b/qa/L3_pytorch_FA_versions_test/test.sh
@@ -7,9 +7,16 @@ set -e
 : ${TE_PATH:=/opt/transformerengine}

 pip install pytest==8.2.1
+
+# Limit parallel build jobs to avoid overwhelming system resources
+export MAX_JOBS=4
+
+# Iterate over Flash Attention versions
 FA_versions=(2.1.1 2.3.0 2.4.0.post1 2.4.1 2.5.7 2.6.3 3.0.0b1)
 for fa_version in "${FA_versions[@]}"
 do
+
+  # Build Flash Attention
   if [ "${fa_version}" \< "3.0.0" ]
   then
     pip install flash-attn==${fa_version}
@@ -19,5 +26,8 @@ do
     mkdir -p $python_path/flashattn_hopper
     wget -P $python_path/flashattn_hopper https://raw.githubusercontent.com/Dao-AILab/flash-attention/main/hopper/flash_attn_interface.py
   fi
+
+  # Run tests
   NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
+
 done
diff --git a/qa/L3_pytorch_convergence_test/test.sh b/qa/L3_pytorch_convergence_test/test.sh
new file mode 100644
index 0000000000..fca621f279
--- /dev/null
+++ b/qa/L3_pytorch_convergence_test/test.sh
@@ -0,0 +1,14 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+: ${TE_PATH:=/opt/transformerengine}
+
+pip install prettytable
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+git checkout b3375a0e38c10e2300ef4be031f7dcabab52b448
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
+python $TE_PATH/tests/pytorch/distributed/print_logs.py
diff --git a/tests/pytorch/test_fusible_ops_distributed.py b/tests/pytorch/distributed/test_fusible_ops.py
similarity index 100%
rename from tests/pytorch/test_fusible_ops_distributed.py
rename to tests/pytorch/distributed/test_fusible_ops.py
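
For reference, a minimal sketch of how one of the relocated QA scripts might be run locally. The TE_PATH default and the script path come from the patch above; that Transformer Engine and a CUDA-capable GPU are already set up in the environment is an assumption, not something the patch specifies.

# Hypothetical local invocation, assuming Transformer Engine is installed
# and the repository is checked out at the scripts' default location.
export TE_PATH=/opt/transformerengine
bash $TE_PATH/qa/L1_pytorch_onnx_test/test.sh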