From 358fbcfbdca52c62f25364f80f1d15ad06048ff1 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Wed, 6 Nov 2024 07:29:23 -0800 Subject: [PATCH] ADLR/megatron-lm!2310 - tests: Re-enable CP tests --- tests/functional_tests/jet_recipes/gpt.yaml | 8 ++++---- .../model_config.yaml | 2 ++ .../model_config.yaml | 2 ++ .../model_config.yaml | 2 ++ .../model_config.yaml | 2 ++ 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index 957db69326..c00f827428 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -101,11 +101,11 @@ products: - gpt3_mr_tp2_pp2_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - # - gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - # - gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - # - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - # - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G # cp and attention + - gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G # cp and attention + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - environment: [lts, dev] scope: [nightly] platforms: [dgx_a100] diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 01c7ffc2f1..d07e244b7a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,8 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + NVTE_FUSED_ATTN: 0 + NVTE_FLASH_ATTN: 1 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 2cc6bd5c6f..4d2dea4597 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,8 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + NVTE_FUSED_ATTN: 0 + NVTE_FLASH_ATTN: 1 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 5630ddd719..a6cf383dbe 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,8 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + NVTE_FUSED_ATTN: 0 + NVTE_FLASH_ATTN: 1 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 4a8a6abdd0..d150435364 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,8 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + NVTE_FUSED_ATTN: 0 + NVTE_FLASH_ATTN: 1 MODEL_ARGS: --num-layers: 12 --hidden-size: 512