Skip to content

Commit

Permalink
ADLR/megatron-lm!2059 - ci: Add release tests for 0.9
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Sep 14, 2024
1 parent 90cd925 commit bf7b978
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 68 deletions.
112 changes: 53 additions & 59 deletions .gitlab/stages/03.convergence-tests.yml
Original file line number Diff line number Diff line change
@@ -1,54 +1,78 @@
release-test:
rules:
- if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release"
.common_release:
stage: convergence_tests
needs: [build_image]
tags:
- ${TAG}
timeout: 7d
parallel:
matrix:
- MODEL: bert
VARIANT: bert_release
TAG: mcore-ssh-node-B
- MODEL: gpt
VARIANT: gpt3_15b_8t_release
TAG: mcore-ssh-node-B
- MODEL: mixtral
VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release
TAG: mcore-ssh-node-B
before_script:
- git rm -r tests/functional_tests/local_recipes || true
- git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes
- ls tests/functional_tests/local_recipes
- python -m venv local/venv
- source local/venv/bin/activate
- INSTALL_DIR=$(pwd)/local
- rm -rf "$INSTALL_DIR"
- mkdir -p "$INSTALL_DIR"
- wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname --machine).sh" -O "$INSTALL_DIR/miniconda.sh"
- bash "$INSTALL_DIR/miniconda.sh" -b -u -p "$INSTALL_DIR"
- rm -rf "$INSTALL_DIR/miniconda.sh"
- source $INSTALL_DIR/bin/activate
- pip install jet-api --upgrade $JET_INDEX_URLS
variables:
GIT_STRATEGY: clone
GIT_SUBMODULE_STRATEGY: none
script:
- |
env
env
set -x
MCORE_RELEASE_NUM=$(python -c "from megatron import core; print(core.__version__)")
export IMAGE_TAG=v$MCORE_RELEASE_NUM-${CI_PIPELINE_ID}
export RUN_NAME=release-testing/mcore-v$MCORE_RELEASE_NUM/$MODEL/$VARIANT
export WANDB_EXPERIMENT=v$MCORE_RELEASE_NUM_$MODEL_$VARIANT
export IMAGE_TAG=${CI_PIPELINE_ID}
export WANDB_API_KEY
CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME)
if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then
echo Please assign a CONVERGENCE_TEST_RUN_NAME
fi
bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh
export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT
export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT
bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh
artifacts:
paths:
- ./golden_values.json

retry:
max: 2

release-test:
rules:
- if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release"
extends: [.common_release]
tags:
- ${TAG}
parallel:
matrix:
- MODEL: bert
VARIANT: bert_release
TAG: mcore-ssh-node-B
- MODEL: gpt
VARIANT: gpt3_15b_8t_release
TAG: mcore-ssh-node-B
- MODEL: mixtral
VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release
TAG: mcore-ssh-node-B
- MODEL: mixtral
VARIANT: mixtral_8x7b_tp1pp4ep8vpp8_release
TAG: mcore-ssh-agent-C
- MODEL: mixtral
VARIANT: mixtral_8x22b_tp2pp8ep8vpp1_release
TAG: mcore-ssh-agent-C
- MODEL: t5
VARIANT: t5_release
TAG: mcore-ssh-agent-C

pre-release-test:
rules:
- if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "pre-release"
stage: convergence_tests
needs: [build_image]
extends: [.common_release]
tags:
- ${TAG}
timeout: 7d
parallel:
matrix:
- MODEL: bert
Expand All @@ -60,33 +84,3 @@ pre-release-test:
- MODEL: mixtral
VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release_sm
TAG: mcore-ssh-node-B
variables:
GIT_SUBMODULE_STRATEGY: none
before_script:
- git rm -r tests/functional_tests/local_recipes || true
- git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes
- ls tests/functional_tests/local_recipes
- python -m venv local/venv
- source local/venv/bin/activate
- pip install jet-api --upgrade $JET_INDEX_URLS
script:
- |
env
set -x
export IMAGE_TAG=${CI_PIPELINE_ID}
export WANDB_API_KEY
CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME)
if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then
echo Please assign a CONVERGENCE_TEST_RUN_NAME
fi
export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT
export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT
bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh
artifacts:
paths:
- ./golden_values.json
21 changes: 12 additions & 9 deletions tests/functional_tests/shell_test_utils/run_ci_test_locally.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ MANDATORY_VARS=(
"CLUSTER"
"DATASET"
"WANDB_EXPERIMENT"
"GPUS_PER_NODE"
)
for mandatory_var in "${MANDATORY_VARS[@]}"; do
if [[ -z "${!mandatory_var}" ]]; then
Expand Down Expand Up @@ -88,15 +89,15 @@ if [[ $(cat "${OUTPUT_PATH}/checkpoints/latest_checkpointed_iteration.txt" || ec
fi

# Fire off sbatch
set +e
sbatch -W <<EOF
#!/bin/bash
echo '#!/bin/bash' > sbatch.sh

#SBATCH --nodes=$NODES
if [[ $GPUS_PER_NODE != null ]]; then
echo '#SBATCH --gres=gpu:8' >> sbatch.sh
fi
echo "#SBATCH --nodes=$NODES
#SBATCH --account $PPP
#SBATCH --partition $PARTITION
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:8
#SBATCH --time "04:00:00"
#SBATCH --job-name=$PPP:mcore:release:$MODEL
#SBATCH --dependency=singleton
Expand All @@ -109,11 +110,13 @@ echo "SLURM_JOB_ID=\$SLURM_JOB_ID" > "$SLURM_LOGS/\${SLURM_JOB_ID}.log"
srun \
--ntasks-per-node=1 \
--container-image="gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG" \
--container-mounts="${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}" \
--container-image='gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG' \
--container-mounts='${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}' \
--container-workdir=/workspace/megatron-lm \
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>"$SLURM_LOGS/\${SLURM_JOB_ID}.log" 2>&1
EOF
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>'$SLURM_LOGS/\${SLURM_JOB_ID}.log' 2>&1" >> sbatch.sh

set +e
sbatch -W sbatch.sh
set -e
done

Expand Down

0 comments on commit bf7b978

Please sign in to comment.