Skip to content

Commit

Permalink
ci: minor improvements to functional tests (#379)
Browse files Browse the repository at this point in the history
Signed-off-by: ashors1 <[email protected]>
Co-authored-by: Terry Kong <[email protected]>
  • Loading branch information
ashors1 and terrykong authored Nov 8, 2024
1 parent f665ad8 commit 7816a72
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 39 deletions.
15 changes: 2 additions & 13 deletions tests/functional/dpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,43 +8,32 @@ export NCCL_ALGO=Tree
export NVTE_APPLY_QK_LAYER_SCALING=1

KL=${KL:-0.1}
#LR=${LR:-9e-7}
GBS=${GBS:-4}
PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}


#MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}')

TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl
VALID_DATA_PATH=$SCRIPT_DIR/test_data/dummy-dpo.jsonl

NAME="llama3_dpo_test"
NAME="dpo_test"

# PARAMETERS
RESULTS_DIR="/tmp/${NAME}"
mkdir -p $RESULTS_DIR

GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_dpo_test

# START HETEROGENEOUS JOB 3
CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="gpt_dpo"

CHECKPOINT_DIR="${RESULTS_DIR}/checkpoints"
TENSOBOARD_DIR="${RESULTS_DIR}/tensorboard"

mkdir -p $RESULTS_DIR
mkdir -p $TENSOBOARD_DIR
mkdir -p $CHECKPOINT_DIR

dpo() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
Expand Down
10 changes: 2 additions & 8 deletions tests/functional/ppo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}')
TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/synthetic-123.jsonl
VALID_DATA_PATH=$SCRIPT_DIR/test_data/synthetic-123.jsonl

NAME="llama3_test"
NAME="ppo_test"

# PARAMETERS
RESULTS_DIR="/tmp/${NAME}"
Expand All @@ -32,7 +32,7 @@ mkdir -p $RESULTS_DIR
GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_ppo_test
PROJECT=ppo_test

CRITIC_CONFIG_PATH="$GPFS/examples/nlp/gpt/conf/"
CRITIC_CONFIG_NAME="gpt_ppo_critic"
Expand Down Expand Up @@ -101,15 +101,9 @@ CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="gpt_ppo_actor"

ACTOR_LOG_DIR="${RESULTS_DIR}/actor_results"
CHECKPOINT_DIR="${ACTOR_LOG_DIR}/checkpoints"
TENSOBOARD_DIR="${ACTOR_LOG_DIR}/tensorboard"

mkdir -p $ACTOR_LOG_DIR
mkdir -p $TENSOBOARD_DIR
mkdir -p $CHECKPOINT_DIR

ACTOR_NAME="${NAME}_actor"

host_critic=localhost

actor() {
Expand Down
12 changes: 3 additions & 9 deletions tests/functional/rm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@ GBS=${GBS:-4}
PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}


#MIN_LR=$(awk -v var="$LR" 'BEGIN {print var - 1e-11}')

TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/test-rm.jsonl
VALID_DATA_PATH=$SCRIPT_DIR/test_data/test-rm.jsonl

NAME="llama3_rm_test"
NAME="rm_test"

# PARAMETERS
RESULTS_DIR="/tmp/${NAME}"
Expand All @@ -27,24 +25,20 @@ mkdir -p $RESULTS_DIR
GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_rm_test
PROJECT=rm_test

# START HETEROGENEOUS JOB 3
CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="training_rm"

CHECKPOINT_DIR="${RESULTS_DIR}/checkpoints"
TENSOBOARD_DIR="${RESULTS_DIR}/tensorboard"

mkdir -p $RESULTS_DIR
mkdir -p $TENSOBOARD_DIR
mkdir -p $CHECKPOINT_DIR

rm_training() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_reward_model.py \
torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_reward_model.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
Expand Down
11 changes: 2 additions & 9 deletions tests/functional/sft.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,32 +15,25 @@ MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-4096}
TRAIN_DATA_PATH=$SCRIPT_DIR/test_data/dummy-sft.jsonl
VALID_DATA_PATH=$SCRIPT_DIR/test_data/dummy-sft.jsonl

NAME="llama3_sft_test"
NAME="sft_test"

# PARAMETERS
RESULTS_DIR="/tmp/${NAME}"
mkdir -p $RESULTS_DIR

GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_sft_test

CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="gpt_sft"

CHECKPOINT_DIR="${RESULTS_DIR}/checkpoints"
TENSOBOARD_DIR="${RESULTS_DIR}/tensorboard"

mkdir -p $RESULTS_DIR
mkdir -p $TENSOBOARD_DIR
mkdir -p $CHECKPOINT_DIR

sft() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_sft.py \
torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_sft.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
Expand Down

0 comments on commit 7816a72

Please sign in to comment.