diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 667e9f5e53..c04d974bf7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -51,7 +51,7 @@ formatting:
   script: &selene-test-resume-launcher-script
     - echo "Running selene resume from checkpoint test. "
     - pwd
-    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR"
+    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR"
     - echo "$run_cmd"
     - ${run_cmd}
     - echo "Completed the job"
@@ -63,7 +63,6 @@ formatting:
     - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
       when: always
   allow_failure: false
-  retry: 2

 .selene_test_launcher: &selene-test-launcher
   tags:
@@ -72,7 +71,7 @@ formatting:
   script: &selene-test-launcher-script
     - echo "Running selene test"
     - pwd
-    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE"
+    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE"
     - echo "$run_cmd"
     - ${run_cmd}
     - echo "Completed the job"
@@ -84,7 +83,6 @@ formatting:
     - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
       when: always
   allow_failure: false
-  retry: 2

 train.te_gpt3.345m_tp2_pp2_1node_50steps:
   <<: *selene-test-launcher
@@ -143,6 +141,20 @@ train.gpt3_core.345m_tp1_pp2_1node_50steps:
     TEST_LEVEL: L0

 train.gpt3_core.345m_tp1_pp4_1node_50steps:
+  <<: *selene-test-launcher
+  variables:
+    <<: [*VARS]
+    RUN_MODEL: gpt3
+    USE_TE: 0
+    TP_SIZE: 1
+    PP_SIZE: 4
+    NUM_NODES: 1
+    MAX_STEPS: 50
+    USE_CORE: 1
+    TIME_LIMIT: "20:00"
+    TEST_LEVEL: L0
+
+train.gpt3_core.345m_tp1_pp4_interleaved_1node_50steps:
   <<: *selene-test-launcher
   variables:
     <<: [*VARS]
@@ -181,7 +193,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_swiglu:
     USE_TE: 0
     TP_SIZE: 1
     PP_SIZE: 4
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -198,7 +209,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_disable_bias_linear:
     USE_TE: 0
     TP_SIZE: 1
     PP_SIZE: 4
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -215,7 +225,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_untie_embeddings_and_outputs:
     USE_TE: 0
     TP_SIZE: 1
     PP_SIZE: 4
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -232,7 +241,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_sequence_parallel:
     USE_TE: 0
     TP_SIZE: 1
     PP_SIZE: 4
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -284,6 +292,20 @@ train.gpt3.345m_tp1_pp2_1node_50steps:
     TEST_LEVEL: L0

 train.gpt3.345m_tp1_pp4_1node_50steps:
+  <<: *selene-test-launcher
+  variables:
+    <<: [*VARS]
+    RUN_MODEL: gpt3
+    USE_TE: 0
+    TP_SIZE: 1
+    PP_SIZE: 4
+    NUM_NODES: 1
+    MAX_STEPS: 50
+    USE_CORE: 0
+    TIME_LIMIT: "20:00"
+    TEST_LEVEL: L0
+
+train.gpt3.345m_tp1_pp4_interleaved_1node_50steps:
   <<: *selene-test-launcher
   variables:
     <<: [*VARS]
@@ -382,7 +404,6 @@ train.te_core_moe_gpt3.345m_tp2_pp2_2experts_1node_50steps:
     USE_TE: 0
     TP_SIZE: 2
     PP_SIZE: 2
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -399,7 +420,6 @@ train.te_core_moe_gpt3.345m_tp2_pp2_4experts2parallel_1node_50steps:
     USE_TE: 0
     TP_SIZE: 2
     PP_SIZE: 2
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -416,7 +436,6 @@ train.te_core_moe_gpt3.345m_tp2_pp1_4experts2parallel_1node_50steps:
     USE_TE: 0
     TP_SIZE: 2
     PP_SIZE: 1
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 1
@@ -433,7 +452,6 @@ train.moe_gpt3.345m_tp2_pp2_4experts_1node_50steps:
     USE_TE: 0
     TP_SIZE: 2
     PP_SIZE: 2
-    VP_SIZE: 1
     NUM_NODES: 1
     MAX_STEPS: 50
     USE_CORE: 0
@@ -479,6 +497,18 @@ train.bert.345m_tp1_pp2_1node_50steps:
     TEST_LEVEL: L0

 train.bert.345m_tp1_pp4_1node_50steps:
+  <<: *selene-test-launcher
+  variables:
+    <<: [*VARS]
+    RUN_MODEL: bert
+    TP_SIZE: 1
+    PP_SIZE: 4
+    NUM_NODES: 1
+    MAX_STEPS: 50
+    TIME_LIMIT: "20:00"
+    TEST_LEVEL: L0
+
+train.bert.345m_tp1_pp4_interleaved_1node_50steps:
   <<: *selene-test-launcher
   variables:
     <<: [*VARS]
diff --git a/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh b/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh
index 63f4c0ea47..73b3603b75 100644
--- a/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh
+++ b/tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh
@@ -21,7 +21,8 @@ if [[ $USE_CORE -eq 1 && $USE_TE -eq 1 ]]; then
 fi

 # step 2 : SETTING RUN NAME
-RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
+if [[ -n $VP_SIZE ]]; then INTERLEAVED_STR="_interleaved"; else INTERLEAVED_STR=""; fi
+RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}${INTERLEAVED_STR}_${NUM_NODES}nodes_${MAX_STEPS}steps
 if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
 if [[ $USE_CORE == 1 ]]; then RUN_NAME=${RUN_NAME}_core_enabled; fi
 if [[ -n $METADATA ]]; then RUN_NAME=${RUN_NAME}_${METADATA}; fi
@@ -47,10 +48,10 @@ export GOTO_NUM_THREADS=2
 export OPENBLAS_NUM_THREADS=2

 # step 5 : CREATING A COPY OF THE SBATCH SCRIPT THAT WILL BE RUN FOR DEBUGGING
-envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $VP_SIZE $MBS $GBS $ADDITIONAL_PARAMS $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_test.sh
+envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $MBS $GBS $ADDITIONAL_PARAMS $USE_TE $TP_SIZE $PP_SIZE $VP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_test.sh

 # step 6 : SUBMITTING THE JOB
-sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS,PYTORCH_IMAGE,ADDITIONAL_PARAMS`
+sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,VP_SIZE,NUM_NODES,MAX_STEPS,MBS,GBS,PYTORCH_IMAGE,ADDITIONAL_PARAMS`
 export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');

 # step 7 : WAITING FOR JOB TO COMPLETE AND PRINTING JOB INFO
@@ -78,4 +79,4 @@ fi
 export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
 PYTEST_EXIT=0
 pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py || PYTEST_EXIT=$?
-if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
\ No newline at end of file
+if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
diff --git a/tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh b/tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh
index 6060d48606..ab3eb22103 100644
--- a/tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh
+++ b/tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh
@@ -39,10 +39,10 @@ export GOTO_NUM_THREADS=2
 export OPENBLAS_NUM_THREADS=2

 # step 5 : CREATING A COPY OF THE SBATCH SCRIPT THAT WILL BE RUN FOR DEBUGGING
-envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $VP_SIZE $MBS $GBS $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh
+envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $MBS $GBS $TP_SIZE $PP_SIZE $VP_SIZE $NUM_NODES $MAX_STEPS' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh

 # step 6 : SUBMITTING THE JOB
-sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES,PYTORCH_IMAGE`
+sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,VP_SIZE,NUM_NODES,PYTORCH_IMAGE`
 export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');

 # step 7 : WAITING FOR JOB TO COMPLETE AND PRINTING JOB INFO
@@ -62,4 +62,4 @@ if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. S
 source $PYTHON_VIRTUAL_ENV
 PYTEST_EXIT=0
 pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py || PYTEST_EXIT=$?
-if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
\ No newline at end of file
+if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
diff --git a/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json b/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json
index 5ed9c5d9f5..784ea91eca 100644
--- a/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json
+++ b/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json
@@ -1 +1 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46275, 10.31499, 10.17122, 9.97326]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22606.0, 20619.0, 26292.0, 23607.0, 21666.0, 21672.0, 23313.0]}, "iteration_timing_avg": 0.9262994117647059}
\ No newline at end of file
+{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.5414, 10.53988, 10.55513, 10.52847, 10.54297, 10.51657, 10.47015, 10.36882, 10.23301, 10.05128]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26510.0, 16034.0, 24829.0, 21005.0, 20977.0, 19155.0, 18836.0]}, "iteration_timing_avg": 0.6206926470588235}
\ No newline at end of file
diff --git a/tests/functional_tests/test_results/bert/bert_tp1_pp4_interleaved_1nodes_50steps.json b/tests/functional_tests/test_results/bert/bert_tp1_pp4_interleaved_1nodes_50steps.json
new file mode 100644
index 0000000000..80be53a258
--- /dev/null
+++ b/tests/functional_tests/test_results/bert/bert_tp1_pp4_interleaved_1nodes_50steps.json
@@ -0,0 +1 @@
+{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46275, 10.31499, 10.17122, 9.97326]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22606.0, 20619.0, 26292.0, 23607.0, 21666.0, 21672.0, 23313.0]}, "iteration_timing_avg": 0.999115588235294}
\ No newline at end of file
diff --git a/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_interleaved_1nodes_50steps.json b/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_interleaved_1nodes_50steps.json
new file mode 100644
index 0000000000..0319d1ca7b
--- /dev/null
+++ b/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_interleaved_1nodes_50steps.json
@@ -0,0 +1 @@
+{"lm loss": {"start_step": 0, "end_step": 45, "step_interval": 5, "values": [10.7951, 10.84939, 10.87411, 10.83459, 10.82865, 10.78676, 10.56492, 10.57063, 10.48545]}, "num-zeros": {"start_step": 0, "end_step": 29, "step_interval": 5, "values": [2561.0, 2771.0, 2141.0, 2656.0, 2737.0, 2472.0]}, "iteration_timing_avg": 0.1285973333333333}
\ No newline at end of file
diff --git a/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_interleaved_1nodes_50steps_core_enabled.json b/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_interleaved_1nodes_50steps_core_enabled.json
new file mode 100644
index 0000000000..429017fda9
--- /dev/null
+++ b/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_interleaved_1nodes_50steps_core_enabled.json
@@ -0,0 +1 @@
+{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.87358, 10.8827, 10.79796, 10.68762, 10.59849, 10.09941, 10.21477, 10.14024, 9.80787]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1500.0, 1792.0, 1899.0, 1853.0, 1884.0, 1847.0, 1596.0, 1783.0, 2314.0, 2349.0]}, "iteration_timing_avg": 0.12620382352941178}
\ No newline at end of file
diff --git a/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh
index 2ddef48bad..ccd793d865 100755
--- a/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh
+++ b/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh
@@ -16,4 +16,4 @@ echo 'Running tests using $PYTORCH_IMAGE image'
 srun --output $BASE_DIR/debug/slurm-%j.out --error $BASE_DIR/debug/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/tensorboard_logs:/workspace/tensorboard_logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
   ls
   cd /workspace/megatron-lm
-  ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS VP_SIZE=$VP_SIZE"
\ No newline at end of file
+  ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS"
diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh
index d71795e785..dce91ed739 100755
--- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh
+++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh
@@ -15,7 +15,6 @@ echo "---------------------------------"
 set -x
 if [[ -n $MBS ]]; then MBS=4; fi
 if [[ -n $GBS ]]; then GBS=32; fi
-if [[ -n $VP_SIZE ]]; then VP_SIZE="" ; fi

 GPUS_PER_NODE=8
 # Change for multinode config
diff --git a/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh
index 5bc660f45d..ba2a1b4b62 100755
--- a/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh
+++ b/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh
@@ -16,4 +16,4 @@ echo 'Running tests using $PYTORCH_IMAGE image'
 srun --output $BASE_DIR/debug/slurm-%j.out --error $BASE_DIR/debug/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/tensorboard_logs:/workspace/tensorboard_logs,$BASE_DIR/debug:/workspace/debug,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
   ls
   cd /workspace/megatron-lm
-  ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR USE_TE=$USE_TE TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS USE_CORE=$USE_CORE VP_SIZE=$VP_SIZE MBS=$MBS GBS=$GBS ADDITIONAL_PARAMS=\"$ADDITIONAL_PARAMS\""
+  ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR USE_TE=$USE_TE TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS USE_CORE=$USE_CORE MBS=$MBS GBS=$GBS ADDITIONAL_PARAMS=\"$ADDITIONAL_PARAMS\""
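Note (not part of the patch): the diff above only threads VP_SIZE from the CI job variables through the launcher scripts and sbatch wrappers; the downstream pretrain_*_distributed_test.sh scripts are expected to turn it into an interleaved-schedule flag. A minimal sketch of that consumption, assuming Megatron-LM's --num-layers-per-virtual-pipeline-stage argument and a hypothetical EXTRA_ARGS variable (the actual option handling is not shown in this diff):

    #!/bin/bash
    # Sketch only: append the virtual-pipeline flag when VP_SIZE is set, so the
    # plain pipeline-parallel (non-interleaved) jobs keep their argument list unchanged.
    EXTRA_ARGS=""
    if [[ -n "$VP_SIZE" ]]; then
        EXTRA_ARGS="--num-layers-per-virtual-pipeline-stage $VP_SIZE"
    fi
    echo "Interleaving args: ${EXTRA_ARGS:-<none>}"

Because the removed line `if [[ -n $VP_SIZE ]]; then VP_SIZE="" ; fi` used to blank out any provided VP_SIZE, forwarding the variable without dropping that line would have had no effect; with a guard like the one sketched above, only the new *_interleaved_* jobs pick up the flag.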