diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml index 99bcb4e2e1..75aac2faab 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -29,7 +29,7 @@ spec: products: - scope: [mr] - time_limit: [1200] + time_limit: [12000] test_case: - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml index 9f5650842e..87a6fb2c23 100644 --- a/tests/functional_tests/jet_recipes/gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -9,7 +9,7 @@ spec: nodes: 1 gpus: 8 platforms: dgx_a100 - time_limit: 1200 + time_limit: 12000 scope: null script: |- ls diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index e7098277a1..abaef86b81 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -29,7 +29,7 @@ spec: products: - scope: [mr] platforms: [dgx_a100] - time_limit: [1200] + time_limit: [12000] test_case: - gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 6b8302b03a..7a20b1145a 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -9,7 +9,7 @@ spec: nodes: 1 gpus: 8 platforms: dgx_a100 - time_limit: 1200 + time_limit: 12000 scope: null script: |- ls diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 87d2a476ac..947023b0eb 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -29,7 +29,7 @@ spec: products: - scope: [mr] - time_limit: [1200] + time_limit: [12000] test_case: - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G diff --git a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index 073585dee6..26d377fd02 100644 --- a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml index eb64af65e3..3a6d46e00d 100644 --- a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 598aa59793..24b9147500 100644 --- a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml index 4cdfc1c44b..f372ca18ce 100644 --- a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -4,7 +4,7 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 NVTE_APPLY_QK_LAYER_SCALING: 1 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml index 70846159d3..476366af7d 100644 --- a/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -4,7 +4,7 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 NVTE_APPLY_QK_LAYER_SCALING: 1 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml index da970b1b3e..b6b7359e5a 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -40,4 +41,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml index f30342bb1c..9f5de1eb86 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -35,10 +36,10 @@ MODEL_ARGS: --eval-iters: 10 --tensor-model-parallel-size: 2 --pipeline-model-parallel-size: 2 - --spec: local - --deterministic-mode: true + --spec: local + --deterministic-mode: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index d71d2d5b87..1f3c1cf607 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -35,11 +36,11 @@ MODEL_ARGS: --eval-iters: 10 --tensor-model-parallel-size: 2 --pipeline-model-parallel-size: 2 - --deterministic-mode: true - --use-checkpoint-args: true + --deterministic-mode: true + --use-checkpoint-args: true --use-checkpoint-opt_param-scheduler: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml index 9ffd3f164f..ade42ea6f4 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -35,12 +36,12 @@ MODEL_ARGS: --eval-iters: 10 --tensor-model-parallel-size: 2 --pipeline-model-parallel-size: 2 - --spec: local - --deterministic-mode: true - --use-checkpoint-args: true + --spec: local + --deterministic-mode: true + --use-checkpoint-args: true --use-checkpoint-opt_param-scheduler: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml index cd18e14d0e..38fd703ccf 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -44,4 +45,4 @@ MODEL_ARGS: --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml index b7377a2397..041d95f9ba 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -37,13 +38,13 @@ MODEL_ARGS: --pipeline-model-parallel-size: 4 --num-layers-per-virtual-pipeline-stage: 2 --use-legacy-models: true - --transformer-impl: local - --deterministic-mode: true - --use-checkpoint-args: true + --transformer-impl: local + --deterministic-mode: true + --use-checkpoint-args: true --use-checkpoint-opt_param-scheduler: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} - --fp16: true + --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 4d85d383ed..a2a39e49a3 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -36,11 +37,11 @@ MODEL_ARGS: --tensor-model-parallel-size: 2 --pipeline-model-parallel-size: 2 --use-legacy-models: true - --transformer-impl: local + --transformer-impl: local --deterministic-mode: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} - --fp16: true + --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index aa37109915..e65fc9cc0d 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -36,13 +37,13 @@ MODEL_ARGS: --tensor-model-parallel-size: 2 --pipeline-model-parallel-size: 2 --use-legacy-models: true - --transformer-impl: local - --deterministic-mode: true - --use-checkpoint-args: true + --transformer-impl: local + --deterministic-mode: true + --use-checkpoint-args: true --use-checkpoint-opt_param-scheduler: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml index 9dfedbcd0a..89c71f6291 100644 --- a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 SKIP_PYTEST: 1 + N_REPEATS: 1 MODEL_ARGS: trainer.num_nodes: 1 trainer.devices: 8 @@ -32,4 +33,4 @@ MODEL_ARGS: model.sequence_parallel: 'True' model.overlap_p2p_comm: 'True' model.batch_p2p_comm: 'False' -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml index dd9d35ef86..d7e926e96e 100644 --- a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 SKIP_PYTEST: 1 + N_REPEATS: 1 MODEL_ARGS: trainer.num_nodes: 1 trainer.devices: 8 @@ -29,4 +30,4 @@ MODEL_ARGS: model.optim.name: distributed_fused_adam model.optim.weight_decay: 0.1 exp_manager.create_checkpoint_callback: 'False' -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 62bc1cba5d..459270a1b2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index e780aed0e1..dcb80dc007 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index b2658b6a07..d94f5277d4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml index 69e9eeed24..9f210d838f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml index e2d3762795..b943bfec0f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml index 7b98858b84..108cb6b1a4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml index d5a6a9a130..1c2a42eaaa 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml index fc589f94fa..cb0214f264 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml index 08f556c1e2..97d3d8c5f0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml index 5dc534753c..1a15825731 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml index 34dd7657f0..c6728722e2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 3039779e57..37cc4615a5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml index 56dc883536..528b691a28 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml index 32ad67e2a4..4f5e8d93b7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml index 93f704b7d8..64d504bf29 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml index f115e94c06..190e5777f2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml index 488589f9f2..99d0ac8f6b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml index 7afec20da2..6242b2ebbc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml index 668241061c..81727e052d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml index 75d0037f4f..525d0f2c90 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml index 176cd5d6de..516e1dd517 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml index a683015714..10fc8c2f23 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml index a995f9390f..ba219d4445 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml index 460746e283..c547f47970 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml index c80b1c225c..72c98e80be 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml index 99fac43c7f..03ddd8a7ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml index 3b61ee4ea1..84128fa780 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml index f25579efe1..b664115f27 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml index 8d61af2bb5..0ec5d88ad9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml @@ -3,7 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 10 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 80f727609f..ee84d93de2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml index c4dd031c19..ffdaec80ad 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 0af105d39d..9dd9e9ecd0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index 6782b694cd..470ba6f926 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index fa5ce41aaa..fb07f9d30c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 85941e4c7b..7cdb56dd00 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +53,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml index dc520751f8..7bdd0c46e2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index f0070af373..b014fdabc0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml index b86c2fcb0d..b2a1643ec8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index b8c0b09668..6c2c9e51ab 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 309398f123..2e0188551a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 995270875f..8fa10f4b9d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml index 539e4312f0..c64a4ef5e7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml index f0e0581593..dda1876e1a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index 4cf91fb542..df7ba9fb3b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml index c7c33314c3..479916c654 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml index ae50df1ce8..20c57f0c95 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index a95d943f21..f7c52c997f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 4c2ef387c8..210febf448 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml index 7725cd9caa..fd67df60ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index f743e0943f..0c0bc85f61 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index beae881c77..7a92bfd8cd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index cdff5e00b7..ef5b64d284 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +53,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml index 588c8a16f0..ca1de0ad37 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -53,4 +54,4 @@ MODEL_ARGS: --ckpt-format: torch --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index d373d7ccf3..30137a040d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml index 4e1ad296ed..1513a18192 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 4e9cda0a24..077c9a36e8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index b4b28e9308..1ccbe1ae31 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index ec4a2338a8..b9ca819495 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 18dde2b9cb..25ea6c933b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -53,4 +54,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index a125bbe7a6..7b7bc27f4b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +52,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 75791d64f3..7da0cc5ddd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 46d36da379..476a1b6b93 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml index ba993c319d..613559a96e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -53,4 +54,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index af724f5eb0..a1f86a64c7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -54,4 +55,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index 688edd5164..fb5ed74f79 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -55,4 +56,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index 32b1dd0ef4..cf4a90e410 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -58,4 +59,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 59ae9ff1e1..793bfb21d4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -55,4 +56,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml index 30b994493e..29b87e9073 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +53,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 322fc34b1d..c4b791a9d4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -53,4 +54,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index 191ca9c652..c2631e84e0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -54,4 +55,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index 661775605d..bc5da0c312 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -57,4 +58,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 5043699d49..7c437e0b10 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -54,4 +55,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 2fd4614dd8..dde8a620d3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index c28031708a..303182bcaf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -44,4 +45,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index 49530a366f..c08ce2e01c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index 3bb836d36b..959c286a50 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 0dd40795b5..c9938b5ee1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -46,4 +47,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index dfe5b75e8e..23060e55e4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 9827106b20..32bd642deb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index b8e763eaf6..7d64cf477f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index 63f5bc56a0..6014052dd6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -45,4 +46,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index bcf5398612..6d8a590974 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index 9a763b34ad..c304692d62 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 9074e6ce44..d8f1585ae2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index 7d1fff5f28..c02d1fdc67 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index ab30aa8110..7d5b13b753 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml index 4276fcf6cb..cff824669b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -44,4 +45,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 104b69873c..8846dacb40 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 9f836b80b6..9295cdc580 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index 42e81f7bcc..b8f1667cdb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index d17ae7a89e..d2888f767c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index fd13e7a0a2..27acfbee86 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index 8e205a2636..1ea30bae73 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml index 9916411c90..f3348d608d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml @@ -1,6 +1,7 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml index 282c7e07a5..fbb767cb14 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml index e2a87210ea..cf65df920f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -4,9 +4,8 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 SKIP_PYTEST: 1 -BEFORE_SCRIPT: - pip uninstall -y transformer_engine - pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely + N_REPEATS: 1 +BEFORE_SCRIPT: pip uninstall -y transformer_engine pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,8 +47,8 @@ MODEL_ARGS: --deterministic-mode: true --no-gradient-accumulation-fusion: true --use-mcore-models: true - --ckpt-format: torch_dist + --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 7d2cada241..af105662a9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -46,4 +47,4 @@ MODEL_ARGS: --use-legacy-models: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 6735a087b1..3d27f95aa6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --use-legacy-models: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index e4c082290e..1e6b07a429 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml index bbb14c899c..2ff5fc2224 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +49,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml index b5881f04d2..4e4a963417 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -46,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index fca698dc0f..8d11e207e7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml index 496cedad25..6da0c3a85a 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 @@ -49,4 +50,4 @@ MODEL_ARGS: --img-w: 336 --patch-dim: 14 --mock-data: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml index 7574866666..816aa8bf1f 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 @@ -50,4 +51,4 @@ MODEL_ARGS: --img-w: 336 --patch-dim: 14 --mock-data: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml index eb82bff8a5..180e6beedd 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml @@ -4,6 +4,7 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 GPUS_PER_NODE: 7 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 @@ -52,4 +53,4 @@ MODEL_ARGS: --img-w: 336 --patch-dim: 14 --mock-data: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml index a56ded5f84..1fade8fd4e 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml @@ -4,6 +4,7 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 GPUS_PER_NODE: 7 + N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 @@ -53,4 +54,4 @@ MODEL_ARGS: --img-w: 336 --patch-dim: 14 --mock-data: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 3a0a741e7a..8abace27d3 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 @@ -51,4 +52,4 @@ MODEL_ARGS: --encoder-pipeline-model-parallel-size: 2 --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 2e06641f34..c1a6d51bf1 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 @@ -51,4 +52,4 @@ MODEL_ARGS: --encoder-pipeline-model-parallel-size: 2 --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume