Skip to content

Commit

Permalink
Merge branch 'interleaved_pp_test_fixes' into 'main'
Browse files: browse the repository at this point in the history
Various fixes for tests using interleaved parallelism

See merge request ADLR/megatron-lm!857
  • Loading branch information
jaredcasper committed Oct 17, 2023
2 parents 954a65b + 4994cf1 commit feac76a
Show file tree
Hide file tree
Showing 10 changed files with 56 additions and 23 deletions.
54 changes: 42 additions & 12 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ formatting:
script: &selene-test-resume-launcher-script
- echo "Running selene resume from checkpoint test. "
- pwd
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR"
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR"
- echo "$run_cmd"
- ${run_cmd}
- echo "Completed the job"
Expand All @@ -63,7 +63,6 @@ formatting:
- if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
allow_failure: false
retry: 2

.selene_test_launcher: &selene-test-launcher
tags:
Expand All @@ -72,7 +71,7 @@ formatting:
script: &selene-test-launcher-script
- echo "Running selene test"
- pwd
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE"
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE"
- echo "$run_cmd"
- ${run_cmd}
- echo "Completed the job"
Expand All @@ -84,7 +83,6 @@ formatting:
- if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
allow_failure: false
retry: 2

train.te_gpt3.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
Expand Down Expand Up @@ -143,6 +141,20 @@ train.gpt3_core.345m_tp1_pp2_1node_50steps:
TEST_LEVEL: L0

train.gpt3_core.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "20:00"
TEST_LEVEL: L0

train.gpt3_core.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
Expand Down Expand Up @@ -181,7 +193,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_swiglu:
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand All @@ -198,7 +209,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_disable_bias_linear:
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand All @@ -215,7 +225,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_untie_embeddings_and_outputs:
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand All @@ -232,7 +241,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_sequence_parallel:
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand Down Expand Up @@ -284,6 +292,20 @@ train.gpt3.345m_tp1_pp2_1node_50steps:
TEST_LEVEL: L0

train.gpt3.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TIME_LIMIT: "20:00"
TEST_LEVEL: L0

train.gpt3.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
Expand Down Expand Up @@ -382,7 +404,6 @@ train.te_core_moe_gpt3.345m_tp2_pp2_2experts_1node_50steps:
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand All @@ -399,7 +420,6 @@ train.te_core_moe_gpt3.345m_tp2_pp2_4experts2parallel_1node_50steps:
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand All @@ -416,7 +436,6 @@ train.te_core_moe_gpt3.345m_tp2_pp1_4experts2parallel_1node_50steps:
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
Expand All @@ -433,7 +452,6 @@ train.moe_gpt3.345m_tp2_pp2_4experts_1node_50steps:
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
Expand Down Expand Up @@ -479,6 +497,18 @@ train.bert.345m_tp1_pp2_1node_50steps:
TEST_LEVEL: L0

train.bert.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: L0

train.bert.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ if [[ $USE_CORE -eq 1 && $USE_TE -eq 1 ]]; then
fi

# step 2 : SETTING RUN NAME
RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
if [[ -n $VP_SIZE ]]; then INTERLEAVED_STR="_interleaved"; else INTERLEAVED_STR=""; fi
RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}${INTERLEAVED_STR}_${NUM_NODES}nodes_${MAX_STEPS}steps
if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
if [[ $USE_CORE == 1 ]]; then RUN_NAME=${RUN_NAME}_core_enabled; fi
if [[ -n $METADATA ]]; then RUN_NAME=${RUN_NAME}_${METADATA}; fi
Expand All @@ -47,10 +48,10 @@ export GOTO_NUM_THREADS=2
export OPENBLAS_NUM_THREADS=2

# step 5 : CREATING A COPY OF THE SBATCH SCRIPT THAT WILL BE RUN FOR DEBUGGING
envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $VP_SIZE $MBS $GBS $ADDITIONAL_PARAMS $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_test.sh
envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $MBS $GBS $ADDITIONAL_PARAMS $USE_TE $TP_SIZE $PP_SIZE $VP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_test.sh

# step 6 : SUBMITTING THE JOB
sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS,PYTORCH_IMAGE,ADDITIONAL_PARAMS`
sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,VP_SIZE,NUM_NODES,MAX_STEPS,MBS,GBS,PYTORCH_IMAGE,ADDITIONAL_PARAMS`
export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');

# step 7 : WAITING FOR JOB TO COMPLETE AND PRINTING JOB INFO
Expand Down Expand Up @@ -78,4 +79,4 @@ fi
export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
PYTEST_EXIT=0
pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py || PYTEST_EXIT=$?
if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ export GOTO_NUM_THREADS=2
export OPENBLAS_NUM_THREADS=2

# step 5 : CREATING A COPY OF THE SBATCH SCRIPT THAT WILL BE RUN FOR DEBUGGING
envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $VP_SIZE $MBS $GBS $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh
envsubst '$BASE_DIR $PYTORCH_IMAGE $BUILD_DIR $DATA_DIR $MBS $GBS $TP_SIZE $PP_SIZE $VP_SIZE $NUM_NODES $MAX_STEPS' <$BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh > $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/debug/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh

# step 6 : SUBMITTING THE JOB
sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES,PYTORCH_IMAGE`
sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,VP_SIZE,NUM_NODES,PYTORCH_IMAGE`
export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');

# step 7 : WAITING FOR JOB TO COMPLETE AND PRINTING JOB INFO
Expand All @@ -62,4 +62,4 @@ if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. S
source $PYTHON_VIRTUAL_ENV
PYTEST_EXIT=0
pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py || PYTEST_EXIT=$?
if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
if [[ $PYTEST_EXIT == 0 ]]; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/debug directory for result logs"; exit $PYTEST_EXIT; fi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46275, 10.31499, 10.17122, 9.97326]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22606.0, 20619.0, 26292.0, 23607.0, 21666.0, 21672.0, 23313.0]}, "iteration_timing_avg": 0.9262994117647059}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.5414, 10.53988, 10.55513, 10.52847, 10.54297, 10.51657, 10.47015, 10.36882, 10.23301, 10.05128]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26510.0, 16034.0, 24829.0, 21005.0, 20977.0, 19155.0, 18836.0]}, "iteration_timing_avg": 0.6206926470588235}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46275, 10.31499, 10.17122, 9.97326]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22606.0, 20619.0, 26292.0, 23607.0, 21666.0, 21672.0, 23313.0]}, "iteration_timing_avg": 0.999115588235294}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"lm loss": {"start_step": 0, "end_step": 45, "step_interval": 5, "values": [10.7951, 10.84939, 10.87411, 10.83459, 10.82865, 10.78676, 10.56492, 10.57063, 10.48545]}, "num-zeros": {"start_step": 0, "end_step": 29, "step_interval": 5, "values": [2561.0, 2771.0, 2141.0, 2656.0, 2737.0, 2472.0]}, "iteration_timing_avg": 0.1285973333333333}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.87358, 10.8827, 10.79796, 10.68762, 10.59849, 10.09941, 10.21477, 10.14024, 9.80787]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1500.0, 1792.0, 1899.0, 1853.0, 1884.0, 1847.0, 1596.0, 1783.0, 2314.0, 2349.0]}, "iteration_timing_avg": 0.12620382352941178}
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ echo 'Running tests using $PYTORCH_IMAGE image'
srun --output $BASE_DIR/debug/slurm-%j.out --error $BASE_DIR/debug/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/tensorboard_logs:/workspace/tensorboard_logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS VP_SIZE=$VP_SIZE"
./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS"
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ echo "---------------------------------"
set -x
if [[ -n $MBS ]]; then MBS=4; fi
if [[ -n $GBS ]]; then GBS=32; fi
if [[ -n $VP_SIZE ]]; then VP_SIZE="" ; fi

GPUS_PER_NODE=8
# Change for multinode config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ echo 'Running tests using $PYTORCH_IMAGE image'
srun --output $BASE_DIR/debug/slurm-%j.out --error $BASE_DIR/debug/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/tensorboard_logs:/workspace/tensorboard_logs,$BASE_DIR/debug:/workspace/debug,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR USE_TE=$USE_TE TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS USE_CORE=$USE_CORE VP_SIZE=$VP_SIZE MBS=$MBS GBS=$GBS ADDITIONAL_PARAMS=\"$ADDITIONAL_PARAMS\""
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh DATA_PATH=$DATA_PATH CHECKPOINT_PATH=$CHECKPOINT_PATH TENSORBOARD_DIR=$TENSORBOARD_DIR SCRIPTS_DIR=$SCRIPTS_DIR USE_TE=$USE_TE TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES MAX_STEPS=$MAX_STEPS USE_CORE=$USE_CORE MBS=$MBS GBS=$GBS ADDITIONAL_PARAMS=\"$ADDITIONAL_PARAMS\""

0 comments on commit feac76a

Please sign in to comment.