From 47b8470fd855ea75dcbb4db70914dd6b3e08e3ee Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 4 Jan 2025 17:03:06 -0800 Subject: [PATCH] ADLR/megatron-lm!2496 - ci: Move most of LTS tests to nightly --- tests/test_utils/python_scripts/common.py | 7 +- .../generate_jet_trigger_job.py | 1 + .../python_scripts/launch_jet_workload.py | 5 + tests/test_utils/recipes/bert.yaml | 93 ++- tests/test_utils/recipes/gpt-modelopt.yaml | 13 +- tests/test_utils/recipes/gpt-nemo.yaml | 22 +- tests/test_utils/recipes/gpt.yaml | 739 ++++++++++++++---- .../test_utils/recipes/multimodal-llava.yaml | 70 +- tests/test_utils/recipes/t5.yaml | 99 ++- tests/test_utils/recipes/unit-tests.yaml | 54 +- 10 files changed, 862 insertions(+), 241 deletions(-) diff --git a/tests/test_utils/python_scripts/common.py b/tests/test_utils/python_scripts/common.py index dd2e2e4706..0167a32b9b 100644 --- a/tests/test_utils/python_scripts/common.py +++ b/tests/test_utils/python_scripts/common.py @@ -25,9 +25,12 @@ def flatten_products( workload_manifest: jetclient.JETWorkloadManifest, ) -> jetclient.JETWorkloadManifest: """Flattens a nested dict of products""" + workload_manifest.products = [ - dict(zip(inp.keys(), values)) - for inp in workload_manifest.products + dict(**dict(zip(inp.keys(), values)), **{"test_case": product['test_case'][0]}) + for product in workload_manifest.products + if "products" in product + for inp in product['products'] for values in itertools.product(*inp.values()) ] diff --git a/tests/test_utils/python_scripts/generate_jet_trigger_job.py b/tests/test_utils/python_scripts/generate_jet_trigger_job.py index 72027e3613..1aa36c8a4b 100644 --- a/tests/test_utils/python_scripts/generate_jet_trigger_job.py +++ b/tests/test_utils/python_scripts/generate_jet_trigger_job.py @@ -131,6 +131,7 @@ def main( f"--environment {test_case.spec.environment}", f"--n-repeat {n_repeat}", f"--time-limit {time_limit}", + f"--scope {scope}", f"--test-case '{test_case.spec.test_case}'", f"--container-tag {container_tag}", f"--cluster {cluster}", diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py index 6cdac6a2c4..45e974855b 100644 --- a/tests/test_utils/python_scripts/launch_jet_workload.py +++ b/tests/test_utils/python_scripts/launch_jet_workload.py @@ -40,6 +40,7 @@ def launch_and_wait_for_completion( environment: str, n_repeat: int, time_limit: int, + scope: str, container_image: Optional[str], container_tag: str, cluster: str, @@ -63,6 +64,7 @@ def launch_and_wait_for_completion( n_repeat=n_repeat, time_limit=time_limit, tag=tag, + scope=scope, container_image=container_image, container_tag=container_tag, environment=environment, @@ -160,6 +162,7 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]: ) @click.option("--n-repeat", required=False, default=1, type=int) @click.option("--time-limit", required=False, default=1800, type=int) +@click.option("--scope", required=False, default="mr", type=str) @click.option( "--account", required=False, @@ -187,6 +190,7 @@ def main( environment: str, n_repeat: int, time_limit: int, + scope: str, account: str, partition: Optional[str], cluster: str, @@ -236,6 +240,7 @@ def main( environment=environment, n_repeat=n_repeat, time_limit=time_limit, + scope=scope, container_image=container_image, container_tag=container_tag, cluster=cluster, diff --git a/tests/test_utils/recipes/bert.yaml b/tests/test_utils/recipes/bert.yaml index 2fc294d1dc..53c6a79467 100644 --- a/tests/test_utils/recipes/bert.yaml +++ b/tests/test_utils/recipes/bert.yaml @@ -9,6 +9,8 @@ spec: build: mcore-pyt-{environment} gpus: 8 platforms: dgx_a100 + time_limit: + n_repeat: artifacts: /workspace/data/bert_data: text/the_pile/bert_shard00 script: |- @@ -30,26 +32,71 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - environment: [lts, dev] - scope: [mr] - time_limit: [1800] - n_repeat: [5] - test_case: - - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G - - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G - - bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G - - bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G - - bert_mr_tp1_pp4_vp2_dgx_a100_1N8G - - bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G - - bert_mr_tp2_pp2_dgx_a100_1N8G - - bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G - - environment: [lts, dev] - scope: [nightly] - n_repeat: [5] - time_limit: [3600] - test_case: - - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 - - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 - - bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 - - bert_nightly_dgx_a100_1N8G_tp1_pp2 - - bert_nightly_dgx_a100_1N8G_tp4_pp1 + - test_case: [bert_mr_mcore_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_tp1_pp4_vp2_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [bert_nightly_dgx_a100_1N8G_tp1_pp2] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [bert_nightly_dgx_a100_1N8G_tp4_pp1] + products: + - environment: [dev, lts] + scope: [nightly] diff --git a/tests/test_utils/recipes/gpt-modelopt.yaml b/tests/test_utils/recipes/gpt-modelopt.yaml index 48428a65a6..c2ff0b91c6 100644 --- a/tests/test_utils/recipes/gpt-modelopt.yaml +++ b/tests/test_utils/recipes/gpt-modelopt.yaml @@ -8,6 +8,9 @@ spec: build: mcore-pyt-{environment} nodes: 1 gpus: 2 + platforms: dgx_a100 + time_limit: + n_repeat: artifacts: /workspace/data/gpt3_data: text/the_pile/shard00 /workspace/checkpoints/teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher @@ -29,9 +32,7 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - scope: [nightly] - platforms: [dgx_a100] - time_limit: [1200] - environment: [lts, dev] # Disable dev for now - test_case: - - gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume + - test_case: [gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume] + products: + - environment: [dev, lts] + scope: [nightly] diff --git a/tests/test_utils/recipes/gpt-nemo.yaml b/tests/test_utils/recipes/gpt-nemo.yaml index fa71a5e7b9..3d937b2dea 100644 --- a/tests/test_utils/recipes/gpt-nemo.yaml +++ b/tests/test_utils/recipes/gpt-nemo.yaml @@ -10,7 +10,7 @@ spec: gpus: 8 platforms: dgx_a100 time_limit: 1800 - scope: null + scope: script: |- ls cd /opt/NeMo @@ -30,11 +30,15 @@ spec: bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - environment: [dev] - scope: [mr] - n_repeat: [5] - test_case: - - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G - - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G - - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G - + - test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - test_case: [gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index db6158a750..42d70839bb 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -8,7 +8,8 @@ spec: build: mcore-pyt-{environment} nodes: 1 gpus: 8 - n_repeat: null + n_repeat: 5 + platforms: dgx_a100 artifacts: /workspace/data/gpt3_data: text/the_pile/shard00 script: |- @@ -30,144 +31,598 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - environment: [lts, dev] - scope: [mr] - platforms: [dgx_a100] - time_limit: [1800] - n_repeat: [5] - test_case: - - gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G - # - gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G # torch >= 2.4.0 - - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G - # - gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G # non-deterministic on gradients - - gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G - - gpt3_mr_te_tp2_pp2_dgx_a100_1N8G - - gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G - - gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G - - gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G - - gpt3_mr_tp2_pp2_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - - gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention - - gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G # cp and attention with a2a+p2p comm type - - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G # cp and attention with a2a+p2p comm type - - environment: [dev] - scope: [mr] - platforms: [dgx_a100] - time_limit: [1800] - n_repeat: [5] - test_case: - - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G - - environment: [lts, dev] - scope: [nightly] - platforms: [dgx_a100] - time_limit: [3600] - n_repeat: [5] - test_case: - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather - # - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te # torch >= 2.4.0 - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel - # - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts # non-determinism - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1 - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2 - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4 - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch - - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts - - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1 - - gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce - - gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch - - environment: [lts] - scope: [nightly] - platforms: [dgx_a100] - time_limit: [3600] - n_repeat: [5] - test_case: - - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel # non-determinism in dev - - environment: [lts, dev] - scope: [weekly] - platforms: [dgx_h100] - time_limit: [9000] - test_case: - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp - - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp + ####################################################################### + # Nightly tests: Run both DEV and LTS unless something is flaky # + ####################################################################### + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel] + products: + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch] + products: + - environment: [dev, lts] + scope: [nightly] + # - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts] + # products: + # - environment: [dev, lts] + # scope: [nightly] + # - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te] + # products: + # - environment: [dev, lts] + # scope: [nightly] + ####################################################################### + # Weekly tests: Run both DEV and LTS unless something is flaky # + ####################################################################### + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp] + products: + - environment: [dev, lts] + scope: [weekly] + - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp] + products: + - environment: [dev, lts] + scope: [weekly] + ####################################################################### + # MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for # + # some very important tests. # + ####################################################################### + - test_case: [gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + + - test_case: [gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + # - test_case: [gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G] + # products: + # - environment: [dev] + # scope: [mr] + # - environment: [lts] + # scope: [nightly] + ####################################################################### + # Super important MR tests that run for both DEV and LTS per MR # + ####################################################################### + - test_case: [gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_te_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + - test_case: [gpt3_mr_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [dev, lts] + scope: [mr] + # - test_case: [gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G] + # products: + # - environment: [dev, lts] + # scope: [mr] diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml index 064b0ed6e6..0d41a1b281 100644 --- a/tests/test_utils/recipes/multimodal-llava.yaml +++ b/tests/test_utils/recipes/multimodal-llava.yaml @@ -12,8 +12,10 @@ spec: nodes: 1 gpus: 8 platforms: dgx_a100 - time_limit: 1800 - scope: null + time_limit: + n_repeat: + test_case: + scope: script: |- ls cd /opt/megatron-lm @@ -33,19 +35,51 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - environment: [lts, dev] - scope: [mr] - n_repeat: [5] - gpus: [8] - test_case: - - multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G - - multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G - - multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G - - multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G - - environment: [lts, dev] - scope: [mr] - n_repeat: [5] - gpus: [7] - test_case: - - multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G - - multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G + - test_case: [multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + gpus: [8] + - environment: [lts] + scope: [nightly] + gpus: [8] + - test_case: [multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + gpus: [8] + - environment: [lts] + scope: [nightly] + gpus: [8] + - test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G] + products: + - environment: [dev] + scope: [mr] + gpus: [7] + - environment: [lts] + scope: [nightly] + gpus: [7] + - test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + gpus: [8] + - environment: [lts] + scope: [nightly] + gpus: [8] + - test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + gpus: [8] + - environment: [lts] + scope: [nightly] + gpus: [8] + - test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G] + products: + - environment: [dev] + scope: [mr] + gpus: [7] + - environment: [lts] + scope: [nightly] + gpus: [7] diff --git a/tests/test_utils/recipes/t5.yaml b/tests/test_utils/recipes/t5.yaml index fe59920633..80f9c586b7 100644 --- a/tests/test_utils/recipes/t5.yaml +++ b/tests/test_utils/recipes/t5.yaml @@ -30,32 +30,73 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - environment: [lts, dev] - scope: [mr] - time_limit: [1800] - n_repeat: [5] - test_case: - - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G - - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G - - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G - - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G - - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G - - environment: [lts] - scope: [mr] - time_limit: [1800] - n_repeat: [5] - test_case: - - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - - environment: [lts, dev] - scope: [nightly] - time_limit: [9000] - n_repeat: [1] - test_case: - - t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch - - t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 - - t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel - - t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 - - t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch - - t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 + - test_case: [t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G] + products: + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G] + products: + - environment: [dev] + scope: [mr] + - environment: [lts] + scope: [nightly] + - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch] + products: + - environment: [dev, lts] + scope: [nightly] + - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1] + products: + - environment: [dev, lts] + scope: [nightly] diff --git a/tests/test_utils/recipes/unit-tests.yaml b/tests/test_utils/recipes/unit-tests.yaml index 921670ab13..968d7f88c5 100644 --- a/tests/test_utils/recipes/unit-tests.yaml +++ b/tests/test_utils/recipes/unit-tests.yaml @@ -66,15 +66,45 @@ spec: done products: - - environment: [lts, dev] - tag: [latest, legacy] - scope: [unit-tests] - n_repeat: [1] - time_limit: [1800] - test_case: - - tests/unit_tests/data/ - - tests/unit_tests/dist_checkpointing/*.py - - tests/unit_tests/dist_checkpointing/models/ - - tests/unit_tests/transformer/*.py - - tests/unit_tests/transformer/moe - - tests/unit_tests + - test_case: [tests/unit_tests/data/] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/dist_checkpointing/*.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/dist_checkpointing/models/] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/transformer/*.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/transformer/moe] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800]