From bf7b97888ce04e9a0bfb4cba36a40805b4954a2d Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 13 Sep 2024 19:14:35 -0700 Subject: [PATCH] ADLR/megatron-lm!2059 - ci: Add release tests for 0.9 --- .gitlab/stages/03.convergence-tests.yml | 112 +++++++++--------- .../shell_test_utils/run_ci_test_locally.sh | 21 ++-- 2 files changed, 65 insertions(+), 68 deletions(-) diff --git a/.gitlab/stages/03.convergence-tests.yml b/.gitlab/stages/03.convergence-tests.yml index a91f24eab8..5c7bd6a7a3 100644 --- a/.gitlab/stages/03.convergence-tests.yml +++ b/.gitlab/stages/03.convergence-tests.yml @@ -1,54 +1,78 @@ -release-test: - rules: - - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release" +.common_release: stage: convergence_tests needs: [build_image] - tags: - - ${TAG} timeout: 7d - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release - TAG: mcore-ssh-node-B before_script: - git rm -r tests/functional_tests/local_recipes || true - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes - ls tests/functional_tests/local_recipes - - python -m venv local/venv - - source local/venv/bin/activate + - INSTALL_DIR=$(pwd)/local + - rm -rf "$INSTALL_DIR" + - mkdir -p "$INSTALL_DIR" + - wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname --machine).sh" -O "$INSTALL_DIR/miniconda.sh" + - bash "$INSTALL_DIR/miniconda.sh" -b -u -p "$INSTALL_DIR" + - rm -rf "$INSTALL_DIR/miniconda.sh" + - source $INSTALL_DIR/bin/activate - pip install jet-api --upgrade $JET_INDEX_URLS + variables: + GIT_STRATEGY: clone + GIT_SUBMODULE_STRATEGY: none script: - | - env + env set -x - - MCORE_RELEASE_NUM=$(python -c "from megatron import core; print(core.__version__)") - export IMAGE_TAG=v$MCORE_RELEASE_NUM-${CI_PIPELINE_ID} - export RUN_NAME=release-testing/mcore-v$MCORE_RELEASE_NUM/$MODEL/$VARIANT - export WANDB_EXPERIMENT=v$MCORE_RELEASE_NUM_$MODEL_$VARIANT + + export IMAGE_TAG=${CI_PIPELINE_ID} export WANDB_API_KEY + CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME) + + if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then + echo Please assign a CONVERGENCE_TEST_RUN_NAME + fi - bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh + export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT + export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT + bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh artifacts: paths: - ./golden_values.json - + retry: + max: 2 + +release-test: + rules: + - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release" + extends: [.common_release] + tags: + - ${TAG} + parallel: + matrix: + - MODEL: bert + VARIANT: bert_release + TAG: mcore-ssh-node-B + - MODEL: gpt + VARIANT: gpt3_15b_8t_release + TAG: mcore-ssh-node-B + - MODEL: mixtral + VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release + TAG: mcore-ssh-node-B + - MODEL: mixtral + VARIANT: mixtral_8x7b_tp1pp4ep8vpp8_release + TAG: mcore-ssh-agent-C + - MODEL: mixtral + VARIANT: mixtral_8x22b_tp2pp8ep8vpp1_release + TAG: mcore-ssh-agent-C + - MODEL: t5 + VARIANT: t5_release + TAG: mcore-ssh-agent-C + pre-release-test: rules: - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "pre-release" - stage: convergence_tests - needs: [build_image] + extends: [.common_release] tags: - ${TAG} - timeout: 7d parallel: matrix: - MODEL: bert @@ -60,33 +84,3 @@ pre-release-test: - MODEL: mixtral VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release_sm TAG: mcore-ssh-node-B - variables: - GIT_SUBMODULE_STRATEGY: none - before_script: - - git rm -r tests/functional_tests/local_recipes || true - - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes - - ls tests/functional_tests/local_recipes - - python -m venv local/venv - - source local/venv/bin/activate - - pip install jet-api --upgrade $JET_INDEX_URLS - script: - - | - env - set -x - - export IMAGE_TAG=${CI_PIPELINE_ID} - export WANDB_API_KEY - CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME) - - if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then - echo Please assign a CONVERGENCE_TEST_RUN_NAME - fi - - export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT - export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT - - bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh - - artifacts: - paths: - - ./golden_values.json \ No newline at end of file diff --git a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh b/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh index 19d0e307a2..3ee776ce9b 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh @@ -38,6 +38,7 @@ MANDATORY_VARS=( "CLUSTER" "DATASET" "WANDB_EXPERIMENT" + "GPUS_PER_NODE" ) for mandatory_var in "${MANDATORY_VARS[@]}"; do if [[ -z "${!mandatory_var}" ]]; then @@ -88,15 +89,15 @@ if [[ $(cat "${OUTPUT_PATH}/checkpoints/latest_checkpointed_iteration.txt" || ec fi # Fire of sbatch -set +e -sbatch -W < sbatch.sh -#SBATCH --nodes=$NODES +if [[ $GPUS_PER_NODE != null ]]; then + echo '#SBATCH --gres=gpu:8' >> sbatch.sh +fi +echo "#SBATCH --nodes=$NODES #SBATCH --account $PPP #SBATCH --partition $PARTITION #SBATCH --ntasks-per-node=1 -#SBATCH --gres=gpu:8 #SBATCH --time "04:00:00" #SBATCH --job-name=$PPP:mcore:release:$MODEL #SBATCH --dependency=singleton @@ -109,11 +110,13 @@ echo "SLURM_JOB_ID=\$SLURM_JOB_ID" > "$SLURM_LOGS/\${SLURM_JOB_ID}.log" srun \ --ntasks-per-node=1 \ - --container-image="gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG" \ - --container-mounts="${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}" \ + --container-image='gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG' \ + --container-mounts='${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}' \ --container-workdir=/workspace/megatron-lm \ - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>"$SLURM_LOGS/\${SLURM_JOB_ID}.log" 2>&1 -EOF + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>'$SLURM_LOGS/\${SLURM_JOB_ID}.log' 2>&1" >> sbatch.sh + +set +e +sbatch -W sbatch.sh set -e done