diff --git a/.vscode/settings.json b/.vscode/settings.json index 79d6fa1..bbadf97 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,6 +47,7 @@ "pbar", "peft", "plamo", + "probs", "psutil", "pubmed", "samsum", diff --git a/megatron_lm/megatron/core/datasets/Makefile b/megatron_lm/megatron/core/datasets/Makefile index a409f51..7bd3930 100644 --- a/megatron_lm/megatron/core/datasets/Makefile +++ b/megatron_lm/megatron/core/datasets/Makefile @@ -1,7 +1,7 @@ CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color CPPFLAGS += $(shell python3 -m pybind11 --includes) LIBNAME = helpers -LIBEXT = $(shell ${PYENV_ROOT}/versions/3.10.12/bin/python3-config --extension-suffix) +LIBEXT = $(shell ${PYENV_ROOT}/versions/3.11.9/bin/python3-config --extension-suffix) default: $(LIBNAME)$(LIBEXT) diff --git a/requirements.txt b/requirements.txt index 773ef20..d67edc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ --find-links https://download.pytorch.org/whl/torch_stable.html -torch==2.2.2+cu121 +torch==2.3.1+cu121 # huggingface transformers>=4.41.1 diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline.sh b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh similarity index 65% rename from scripts/abci/instruction/swallow-7b/swallow-7b-baseline.sh rename to scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 5c74799..4ebf185 100644 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline.sh +++ b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,16 +1,19 @@ #!/bin/bash #$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 +#$ -l h_rt=0:01:00:00 #$ -j y -#$ -o outputs/instruction/swallow-7b/ +#$ -o outputs/instruction/Llama-3-8B/ #$ -cwd # module load source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 module load hpcx/2.12 +module load gcc/11.4.0 # swich virtual env source .env/bin/activate @@ -44,33 +47,33 @@ while read -r line; do done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" # training config -SEQ_LENGTH=4096 +SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=64 +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=128 # optimizer config -LR=2e-5 -MIN_LR=2e-6 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/baseline-lr_${LR}-minlr_${MIN_LR}" +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3-8B-Instruct-v0.2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline +DATASET_DIR=/groups/gag51395/datasets/instruction/2023-swallow/training/baseline TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl # job name -JOB_NAME="Swallow-7b-VE-baseline-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" 
+JOB_NAME="Llama-3-8B-instruct-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" # run mpirun -np $NUM_GPUS \ @@ -78,20 +81,18 @@ mpirun -np $NUM_GPUS \ -hostfile $HOSTFILE_NAME \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ + -bind-to none \ + -x PATH \ + -x LD_LIBRARY_PATH \ -x PATH \ python examples/finetuning.py \ --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ --micro-batch-size ${MICRO_BATCH_SIZE} \ --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ --instruction-train-data-path ${TRAIN_DATA_PATH} \ --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ + --epoch 1 \ --lr ${LR} \ --min-lr ${MIN_LR} \ --lr-decay-style cosine \ @@ -100,10 +101,10 @@ mpirun -np $NUM_GPUS \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --adam-eps 1e-6 \ + --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ + --eval-interval 500 \ + --eval-iters 10 \ --bf16 \ --mixed-precision \ --base-model ${CHECKPOINT_DIR} \ @@ -116,6 +117,6 @@ mpirun -np $NUM_GPUS \ --instruction-tuning \ --save-sampler-state \ --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ + --wandb-entity "okoge" \ + --wandb-project "llm-recipes" \ --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-13b/swallow-13b-dolly-oasst2-top1-imitation-2-3.sh b/scripts/abci/instruction/swallow-13b/swallow-13b-dolly-oasst2-top1-imitation-2-3.sh deleted file mode 100644 index 10b834e..0000000 --- a/scripts/abci/instruction/swallow-13b/swallow-13b-dolly-oasst2-top1-imitation-2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-13b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=2 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-13b-VE-chat/dolly-oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset 
-DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/dolly-oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-13b-VE-dolly-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-13b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-13b/swallow-13b-oasst2-top1-imitation2-3.sh b/scripts/abci/instruction/swallow-13b/swallow-13b-oasst2-top1-imitation2-3.sh deleted file mode 100644 index b360148..0000000 --- a/scripts/abci/instruction/swallow-13b/swallow-13b-oasst2-top1-imitation2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-13b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=2 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf 
-CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-13b-VE-chat/oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-13b-VE-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-13b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-70b/swallow-70b-baseline_imitatation_2.sh b/scripts/abci/instruction/swallow-70b/swallow-70b-baseline_imitatation_2.sh deleted file mode 100644 index 6eb65ef..0000000 --- a/scripts/abci/instruction/swallow-70b/swallow-70b-baseline_imitatation_2.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=16 -#$ -l h_rt=0:10:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-70b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=1e-5 -MIN_LR=1e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer 
-TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/baseline-imitation-2-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline-imitation_2 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-70b-VE-baseline-imitation-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-70b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-70b/swallow-70b-dolly-oasst2-top1-imitation-2-3.sh b/scripts/abci/instruction/swallow-70b/swallow-70b-dolly-oasst2-top1-imitation-2-3.sh deleted file mode 100644 index 34f0aaf..0000000 --- a/scripts/abci/instruction/swallow-70b/swallow-70b-dolly-oasst2-top1-imitation-2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=16 -#$ -l h_rt=1:05:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-70b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 
-DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=1e-5 -MIN_LR=1e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7-b-VE-chat/dolly-oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/dolly-oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-70b-VE-dolly-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-70b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-70b/swallow-70b-oasst2-top1-imitation2-3.sh b/scripts/abci/instruction/swallow-70b/swallow-70b-oasst2-top1-imitation2-3.sh deleted file mode 100644 index 8cdef14..0000000 --- a/scripts/abci/instruction/swallow-70b/swallow-70b-oasst2-top1-imitation2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=16 -#$ -l h_rt=0:14:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-70b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read 
-r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=1e-5 -MIN_LR=1e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-70b-VE-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-70b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_GB_256.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_GB_256.sh deleted file mode 100644 index c9ae201..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_GB_256.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:08:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * 
${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/baseline-lr_${LR}-minlr_${MIN_LR}_GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-baseline-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_imitatation_2.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_imitatation_2.sh deleted file mode 100644 index 1f12d22..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_imitatation_2.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:10:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" 
-fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/baseline-imitation-2-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline-imitation_2 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-baseline-imitation-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-dolly-oasst2-top1-imitation-2-3.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-dolly-oasst2-top1-imitation-2-3.sh deleted file mode 100644 index 11bf134..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-dolly-oasst2-top1-imitation-2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then 
- export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/dolly-oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/dolly-oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-dolly-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation-2-oass2-top1.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-imitation-2-oass2-top1.sh deleted file mode 100644 index e61df85..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation-2-oass2-top1.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:08:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ 
"$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=64 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/imitation-2-oasst2-top1-lr_${LR}-minlr_${MIN_LR}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_2_oasst2_top1 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-imitation-2-oasst2-top1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation_1_and_2.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-imitation_1_and_2.sh deleted file mode 100644 index 9c15834..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation_1_and_2.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:08:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - 
-echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=64 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/imitation-1-and-2-lr_${LR}-minlr_${MIN_LR}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_1_and_2 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-imitation-1-and-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-oasst2-top1-imitation2-3.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-oasst2-top1-imitation2-3.sh deleted file mode 100644 index cad631c..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-oasst2-top1-imitation2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:10:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" 
-f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/llama3/Llama-3-8b.sh b/scripts/abci/llama3/Llama-3-8b.sh index 2208409..e4b9cb6 100644 --- a/scripts/abci/llama3/Llama-3-8b.sh +++ b/scripts/abci/llama3/Llama-3-8b.sh @@ -1,17 +1,17 @@ #!/bin/bash -#$ -l rt_AF=4 -#$ -l h_rt=5:0:00:00 +#$ -l rt_AF=2 +#$ -l h_rt=0:1:00:00 #$ -j y #$ -o outputs/Llama-3-8b/ #$ -cwd # module load source /etc/profile.d/modules.sh -module use /groups/gag51395/modules/modulefiles +module use /bb/llm/gaf51275/modules/modulefiles module load cuda/12.1/12.1.1 module load cudnn/cuda-12.1/9.0.0 -module load nccl/2.17/2.17.1-1 +module load nccl/2.20.5 module load hpcx/2.12 module load gcc/11.4.0 @@ -108,7 +108,7 
@@ mpirun -np $NUM_GPUS \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --adam-eps 1e-5 \ + --adam-eps 1e-8 \ --save-interval 500 \ --eval-interval 100 \ --eval-iters 10 \ diff --git a/scripts/gcp/gemma-swallow/gemma-swallow-27b.sh b/scripts/gcp/gemma-swallow/gemma-swallow-27b.sh new file mode 100644 index 0000000..29da6ba --- /dev/null +++ b/scripts/gcp/gemma-swallow/gemma-swallow-27b.sh @@ -0,0 +1,182 @@ +#!/bin/bash +#SBATCH --job-name=gemma +#SBATCH --partition=a3 +#SBATCH --exclusive +#SBATCH --nodes 8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=3-12:00:00 +#SBATCH --output=outputs/gemma/%x-%j.out +#SBATCH --error=outputs/gemma/%x-%j.out + +set -e + +# module load +module load turing/cuda/12.1 +module load turing/cudnn/8.9.7 +module load turing/nccl/2.20.5 +module load turing/hpcx/2.17.1 + +# open file limit +ulimit -n 65536 1048576 + +# python virtualenv +source .env/bin/activate + +# Important TCPX environment variables +UDS_PATH="/run/tcpx-${SLURM_JOB_ID}" + +# Only use TCPX for multi-node jobs. +[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no + +# Only use TCPX for multi-node jobs. +if [[ ${USE_TCPX} = "yes" ]]; then + # Set up NCCL Environment variables + export NCCL_NET=GPUDirectTCPX_v7 + # These network interfaces use Ubuntu's consistent naming scheme. See + # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12 + export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0 + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_NSOCKS_PERTHREAD=4 + export NCCL_SOCKET_NTHREADS=1 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export NCCL_NET_GDR_LEVEL=PIX + export NCCL_P2P_PXN_LEVEL=0 + export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" + + export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} +else + unset NCCL_NET +fi + +# distributed settings +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile +export NUM_GPU_PER_NODE=8 +NODE_TYPE="H100" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 +TRAIN_STEPS=25000 + +# optimizer config +LR=1.5E-5 +MIN_LR=1.5E-6 +LR_WARMUP_STEPS=1000 +LR_DECAY_STEPS=25000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 +# model config +TOKENIZER_MODEL=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-27b/tokenizer.model +CHECKPOINT_DIR=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-27b +CHECKPOINT_SAVE_DIR="/data/checkpoints/gemma-2-27b/LR${LR}-MINLR${MIN_LR}-WARMUP${LR_WARMUP_STEPS}-WD${WEIGHT_DECAY}-GC${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/data/gemma_datasets/gemma-2_original_transformers-4.42.4 + 
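+# The block below assembles TRAIN_DATA_PATH as a blended-dataset spec:
+# a space-separated list of "<weight> <indexed-dataset prefix>" pairs that is
+# handed to finetuning.py via --data-path and split into train/valid/test
+# by the --split argument further down.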
+TRAIN_DATA_PATH="" + +# ja swallow corpus +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9651717149 ${DATASET_DIR}/split_0_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9509783737 ${DATASET_DIR}/split_1_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 11318518471 ${DATASET_DIR}/split_2_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14904913186 ${DATASET_DIR}/split_3_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34418569125 ${DATASET_DIR}/split_4_text_document" + +# ja wikipedia +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1478281282 ${DATASET_DIR}/ja_wiki_merged_text_document" + +# ja-en laboro +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 769992751 ${DATASET_DIR}/default_plain_text_format_text_document" + +# en wikipedia +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/en_wiki_merged_train_text_document" + +# en refinedweb +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/lumi_en_falcon_merge_text_document" + +# en cosmopedia +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1276041352 ${DATASET_DIR}/cosmopedia_automathtext_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20344318 ${DATASET_DIR}/cosmopedia_khanacademy_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 94906162 ${DATASET_DIR}/cosmopedia_openstax_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 914799590 ${DATASET_DIR}/cosmopedia_stanford_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2621500949 ${DATASET_DIR}/cosmopedia_stories_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 158819730 ${DATASET_DIR}/cosmopedia_wikihow_train_text_document" + +# code +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/algebraic-stack_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/proof-pile-2-train_merged_open-web-math_text_document" + + +# job name +JOB_NAME="gemma-2-turing-swallow-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --sliding-window-size ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --data-path ${TRAIN_DATA_PATH} \ + --split 989,10,1 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --lr-decay-iters ${LR_DECAY_STEPS} \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 250 \ + --eval-interval 100 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --use-mpi \ + --wandb-entity "turing-geniac" \ + --wandb-project "gemma-2-turing-swallow" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/gcp/gemma-swallow/gemma-swallow-9b.sh b/scripts/gcp/gemma-swallow/gemma-swallow-9b.sh new file mode 100644 index 0000000..b6e7e16 --- /dev/null +++ b/scripts/gcp/gemma-swallow/gemma-swallow-9b.sh @@ -0,0 +1,182 @@ 
+#!/bin/bash +#SBATCH --job-name=gemma +#SBATCH --partition=a3 +#SBATCH --exclusive +#SBATCH --nodes 8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=3-12:00:00 +#SBATCH --output=outputs/gemma/%x-%j.out +#SBATCH --error=outputs/gemma/%x-%j.out + +set -e + +# module load +module load turing/cuda/12.1 +module load turing/cudnn/8.9.7 +module load turing/nccl/2.20.5 +module load turing/hpcx/2.17.1 + +# open file limit +ulimit -n 65536 1048576 + +# python virtualenv +source .env/bin/activate + +# Important TCPX environment variables +UDS_PATH="/run/tcpx-${SLURM_JOB_ID}" + +# Only use TCPX for multi-node jobs. +[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no + +# Only use TCPX for multi-node jobs. +if [[ ${USE_TCPX} = "yes" ]]; then + # Set up NCCL Environment variables + export NCCL_NET=GPUDirectTCPX_v7 + # These network interfaces use Ubuntu's consistent naming scheme. See + # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12 + export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0 + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_NSOCKS_PERTHREAD=4 + export NCCL_SOCKET_NTHREADS=1 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export NCCL_NET_GDR_LEVEL=PIX + export NCCL_P2P_PXN_LEVEL=0 + export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" + + export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} +else + unset NCCL_NET +fi + +# distributed settings +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile +export NUM_GPU_PER_NODE=8 +NODE_TYPE="H100" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 +TRAIN_STEPS=25000 + +# optimizer config +LR=2.5E-5 +MIN_LR=2.5E-6 +LR_WARMUP_STEPS=1000 +LR_DECAY_STEPS=25000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 +# model config +TOKENIZER_MODEL=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-9b/tokenizer.model +CHECKPOINT_DIR=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-9b +CHECKPOINT_SAVE_DIR="/data/checkpoints/gemma-2-9b/LR${LR}-MINLR${MIN_LR}-WARMUP${LR_WARMUP_STEPS}-WD${WEIGHT_DECAY}-GC${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/data/gemma_datasets/gemma-2_original_transformers-4.42.4 + +TRAIN_DATA_PATH="" + +# ja swallow corpus +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9651717149 ${DATASET_DIR}/split_0_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9509783737 ${DATASET_DIR}/split_1_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 11318518471 ${DATASET_DIR}/split_2_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14904913186 ${DATASET_DIR}/split_3_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 
34418569125 ${DATASET_DIR}/split_4_text_document"
+
+# ja wikipedia
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1478281282 ${DATASET_DIR}/ja_wiki_merged_text_document"
+
+# ja-en laboro
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 769992751 ${DATASET_DIR}/default_plain_text_format_text_document"
+
+# en wikipedia
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/en_wiki_merged_train_text_document"
+
+# en refinedweb
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/lumi_en_falcon_merge_text_document"
+
+# en cosmopedia
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1276041352 ${DATASET_DIR}/cosmopedia_automathtext_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20344318 ${DATASET_DIR}/cosmopedia_khanacademy_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 94906162 ${DATASET_DIR}/cosmopedia_openstax_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 914799590 ${DATASET_DIR}/cosmopedia_stanford_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2621500949 ${DATASET_DIR}/cosmopedia_stories_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 158819730 ${DATASET_DIR}/cosmopedia_wikihow_train_text_document"
+
+# code
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/algebraic-stack_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/proof-pile-2-train_merged_open-web-math_text_document"
+
+
+# job name
+JOB_NAME="gemma-2-turing-swallow-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
+
+# run
+mpirun -np $NUM_GPUS \
+  --npernode $NUM_GPU_PER_NODE \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  -bind-to none \
+  -x LD_LIBRARY_PATH \
+  -x PATH \
+  python examples/finetuning.py \
+  --seq-length ${SEQ_LENGTH} \
+  --sliding-window-size ${SEQ_LENGTH} \
+  --micro-batch-size ${MICRO_BATCH_SIZE} \
+  --global-batch-size ${GLOBAL_BATCH_SIZE} \
+  --train-iters ${TRAIN_STEPS} \
+  --tokenizer-type Llama2Tokenizer \
+  --tokenizer-model ${TOKENIZER_MODEL} \
+  --data-path ${TRAIN_DATA_PATH} \
+  --split 989,10,1 \
+  --lr ${LR} \
+  --min-lr ${MIN_LR} \
+  --lr-decay-style cosine \
+  --lr-warmup-iters ${LR_WARMUP_STEPS} \
+  --lr-decay-iters ${LR_DECAY_STEPS} \
+  --weight-decay ${WEIGHT_DECAY} \
+  --grad-clip-norm ${GRAD_CLIP} \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.95 \
+  --adam-eps 1e-8 \
+  --save-interval 500 \
+  --eval-interval 100 \
+  --eval-iters 10 \
+  --bf16 \
+  --mixed-precision \
+  --base-model ${CHECKPOINT_DIR} \
+  --save ${CHECKPOINT_SAVE_DIR} \
+  --load ${CHECKPOINT_SAVE_DIR} \
+  --low-cpu-fsdp \
+  --sharding-strategy FULL_SHARD \
+  --checkpoint-type LOCAL_STATE_DICT \
+  --fsdp-activation-checkpointing \
+  --use-mpi \
+  --wandb-entity "turing-geniac" \
+  --wandb-project "gemma-2-turing-swallow" \
+  --wandb-name "${JOB_NAME}"
diff --git a/scripts/gcp/install.sh b/scripts/gcp/install.sh
new file mode 100644
index 0000000..ca35f01
--- /dev/null
+++ b/scripts/gcp/install.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#SBATCH --job-name=install
+#SBATCH --partition=a3
+#SBATCH --nodes 1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/install/%x-%j.out
+#SBATCH --error=outputs/install/%x-%j.out
+
+set -e
+
+# module load
+module load turing/cuda/12.1
+module load turing/cudnn/8.9.7
+module load turing/nccl/2.20.5
+module load turing/hpcx/2.17.1
+
+# switch virtual env
+source .env/bin/activate
+
+# pip version up
+pip install --upgrade pip
+
+# pip install requirements
+pip install -r requirements.txt
+pip install ninja packaging wheel
+
+# distributed training requirements
+pip install mpi4py
+
+# huggingface requirements
+pip install huggingface_hub
+
+# install transformer engine
+pip install git+https://github.com/NVIDIA/TransformerEngine.git@v1.6
+pip uninstall flash-attn
+
+# install flash-attn
+git clone git@github.com:Dao-AILab/flash-attention.git
+cd flash-attention
+git checkout v2.4.2
+pip install -e .
diff --git a/scripts/tsubame/dpo/Llama-3-8B/Llama-3-8B-chat-v0.2.sh b/scripts/tsubame/dpo/Llama-3-8B/Llama-3-8B-chat-v0.2.sh
new file mode 100644
index 0000000..8e134ab
--- /dev/null
+++ b/scripts/tsubame/dpo/Llama-3-8B/Llama-3-8B-chat-v0.2.sh
@@ -0,0 +1,116 @@
+#!/bin/sh
+#$ -cwd
+#$ -l node_f=4
+#$ -l h_rt=1:00:00:00
+#$ -o outputs/Llama-3-8b-dpo/$JOB_ID.log
+#$ -e outputs/Llama-3-8b-dpo/$JOB_ID.log
+#$ -p -5
+
+# module load
+module use /gs/fs/tga-NII-LLM/modules/modulefiles
+
+module load ylab/cuda/12.1
+module load ylab/cudnn/8.9.7
+module load ylab/nccl/cuda-12.2/2.20.5
+module load ylab/hpcx/2.17.1
+module load ninja/1.11.1
+
+# switch virtual env
+source .env/bin/activate
+
+# distributed settings
+export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
+export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+# hostfile
+export NUM_GPU_PER_NODE=4
+NODE_TYPE="h100"
+
+NUM_NODES=$NHOSTS
+NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))
+
+mkdir -p ./hostfile
+
+HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
+while read -r hostname _ rest; do
+  echo "${hostname} slots=${NUM_GPU_PER_NODE}"
+done <"$PE_HOSTFILE" >"$HOSTFILE_NAME"
+
+# training config
+SEQ_LENGTH=8192
+DATA_PARALLEL_SIZE=$NUM_GPUS
+
+MICRO_BATCH_SIZE=2
+GLOBAL_BATCH_SIZE=128
+
+# optimizer config
+LR=1e-5
+MIN_LR=1e-6
+WEIGHT_DECAY=0.1
+GRAD_CLIP=1
+
+# checkpoint
+TOKENIZER_DIR=/gs/bs/tga-NII-LLM/hf-checkpoints/Meta-Llama-3-8B-Instruct
+CHECKPOINT_DIR=/gs/bs/tga-NII-LLM/swallow-hf/Llama-3-Swallow-8B-v0.1
+CHECKPOINT_SAVE_DIR="/gs/bs/tga-NII-LLM/checkpoints/Llama-3-8B-chat-v0.2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"
+
+mkdir -p ${CHECKPOINT_SAVE_DIR}
+
+# dataset
+DATASET_DIR=/gs/bs/tga-NII-LLM/datasets/raw/dpo/hh-rlhf-12k-ja
+
+TRAIN_DATA_PATH=${DATASET_DIR}/converted.jsonl
+VALID_DATA_PATH=${DATASET_DIR}/converted.jsonl
+
+# job name
+JOB_NAME="Llama-3-8B-dpo-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
+
+# run
+mpirun -np $NUM_GPUS \
+  --npernode $NUM_GPU_PER_NODE \
+  -hostfile $HOSTFILE_NAME \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  -x CUDA_DEVICE_MAX_CONNECTIONS=1 \
+  -x LD_LIBRARY_PATH \
+  -x PATH \
+  -bind-to none \
+  python examples/finetuning.py \
+  --seq-length ${SEQ_LENGTH} \
+  --micro-batch-size ${MICRO_BATCH_SIZE} \
+  --global-batch-size ${GLOBAL_BATCH_SIZE} \
+  --hf-transformer-model-dir ${TOKENIZER_DIR} \
+  --dpo-train-data-path ${TRAIN_DATA_PATH} \
+  --dpo-valid-data-path ${VALID_DATA_PATH} \
+  --epoch 1 \
+  --lr ${LR} \
+  --min-lr ${MIN_LR} \
+  --lr-decay-style cosine \
+  --weight-decay ${WEIGHT_DECAY} \
+  --grad-clip-norm ${GRAD_CLIP} \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.95 \
+  --adam-eps 1e-8 \
+  --save-interval 500 \
+  --eval-interval 500 \
+  --eval-iters 10 \
+  --bf16 \
+  --mixed-precision \
+  --base-model ${CHECKPOINT_DIR} \
+  --save ${CHECKPOINT_SAVE_DIR} \
+  --load ${CHECKPOINT_SAVE_DIR} \
+  --low-cpu-fsdp \
+  --sharding-strategy FULL_SHARD \
+
--checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --direct-preference-optimization \ + --dpo-beta 0.1 \ + --dpo-label-smoothing 0.0 \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3-8B-chat-v0.2" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/tsubame/install.sh b/scripts/tsubame/install.sh index 0535157..900f578 100644 --- a/scripts/tsubame/install.sh +++ b/scripts/tsubame/install.sh @@ -4,23 +4,30 @@ #$ -l h_rt=1:00:00 #$ -p -5 -# priotiry: -5: normal, -4: high, -3: highest - # Load modules -module load cuda/12.1.0 -module load nccl/2.20.5 -module load openmpi/5.0.2-gcc +module use /gs/fs/tga-NII-LLM/modules/modulefiles + +module load ylab/cuda/12.1 +module load ylab/cudnn/8.9.7 +module load ylab/nccl/cuda-12.2/2.20.5 +module load ylab/hpcx/2.17.1 module load ninja/1.11.1 -module load ~/modulefiles/cudnn/9.0.0 # Set environment variables source .env/bin/activate +# pip version up pip install --upgrade pip -# Install packages +# pip install requirements pip install -r requirements.txt -# flash attn +# distirbuted training requirements +pip install mpi4py + +# huggingface requirements +pip install huggingface_hub + +# install flash-atten pip install ninja packaging wheel pip install flash-attn --no-build-isolation diff --git a/scripts/abci/instruction/swallow-13b/swallow-13b-baseline_imitatation_2.sh b/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh similarity index 51% rename from scripts/abci/instruction/swallow-13b/swallow-13b-baseline_imitatation_2.sh rename to scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 582e974..fa9e3f5 100644 --- a/scripts/abci/instruction/swallow-13b/swallow-13b-baseline_imitatation_2.sh +++ b/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,16 +1,19 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:20:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-13b/ +#!/bin/sh #$ -cwd +#$ -l node_f=4 +#$ -l h_rt=0:20:00:00 +#$ -o outputs/Llama-3-8b-instruct/$JOB_ID.log +#$ -e outputs/Llama-3-8b-instruct/$JOB_ID.log +#$ -p -5 # module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 +module use /gs/fs/tga-NII-LLM/modules/modulefiles + +module load ylab/cuda/12.1 +module load ylab/cudnn/8.9.7 +module load ylab/nccl/cuda-12.2/2.20.5 +module load ylab/hpcx/2.17.1 +module load ninja/1.11.1 # swich virtual env source .env/bin/activate @@ -22,16 +25,8 @@ export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) echo "MASTER_ADDR=${MASTER_ADDR}" # hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi +export NUM_GPU_PER_NODE=4 +NODE_TYPE="h100" NUM_NODES=$NHOSTS NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) @@ -39,38 +34,38 @@ NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) mkdir -p ./hostfile HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" +while read -r hostname _ rest; do + echo "${hostname} slots=${NUM_GPU_PER_NODE}" +done <"$PE_HOSTFILE" >"$HOSTFILE_NAME" # training config -SEQ_LENGTH=4096 +SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS MICRO_BATCH_SIZE=2 -GLOBAL_BATCH_SIZE=256 +GLOBAL_BATCH_SIZE=128 # optimizer config -LR=2e-5 
-MIN_LR=2e-6 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-13b-VE-chat/baseline-imitation-2-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" +# checkpoint +TOKENIZER_DIR=/gs/bs/tga-NII-LLM/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/gs/bs/tga-NII-LLM/swallow-hf/Llama-3-Swallow-8B-v0.1 +CHECKPOINT_SAVE_DIR="/gs/bs/tga-NII-LLM/checkpoints/Llama-3-8B-Instruct-v0.2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline-imitation_2 +DATASET_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl +TRAIN_DATA_PATH=${DATASET_DIR}/converted.jsonl +VALID_DATA_PATH=${DATASET_DIR}/converted.jsonl # job name -JOB_NAME="Swallow-13b-VE-baseline-imitation-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" +JOB_NAME="Llama-3-8B-instruct-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" # run mpirun -np $NUM_GPUS \ @@ -78,20 +73,18 @@ mpirun -np $NUM_GPUS \ -hostfile $HOSTFILE_NAME \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ + -x CUDA_DEVICE_MAX_CONNECTIONS=1 \ + -x LD_LIBRARY_PATH \ -x PATH \ + -bind-to none \ python examples/finetuning.py \ --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ --micro-batch-size ${MICRO_BATCH_SIZE} \ --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ --instruction-train-data-path ${TRAIN_DATA_PATH} \ --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ + --epoch 1 \ --lr ${LR} \ --min-lr ${MIN_LR} \ --lr-decay-style cosine \ @@ -100,10 +93,10 @@ mpirun -np $NUM_GPUS \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --adam-eps 1e-6 \ + --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ + --eval-interval 500 \ + --eval-iters 10 \ --bf16 \ --mixed-precision \ --base-model ${CHECKPOINT_DIR} \ @@ -117,5 +110,5 @@ mpirun -np $NUM_GPUS \ --save-sampler-state \ --use-mpi \ --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-13b-instruct" \ + --wandb-project "Llama-3-8B-Instruct-v0.2" \ --wandb-name "${JOB_NAME}" diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 62da802..c69f447 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -20,6 +20,8 @@ def parse_args() -> argparse.Namespace: # validate if args.use_freeze_layers: assert args.no_save_optimizer_state is True + # adam epsilon is very sensitive value so don't change + assert args.adam_eps == 1e-8 return args @@ -147,6 +149,13 @@ def _add_data_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: '--vocab-extra-ids', type=int, default=0, help='Number of additional vocabulary tokens. 
They are used for span masking in the T5 model' ) + # instruction tuning + group.add_argument( + '--system-prompt-role', type=str, default="system" + ) + group.add_argument( + '--system-prompt-content', type=str, default='あなたは誠実で優秀な日本人のアシスタントです。' + ) return parser @@ -261,6 +270,8 @@ def _add_training_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPars group.add_argument("--instruction-tuning", action="store_true") # DPO group.add_argument("--direct-preference-optimization", action="store_true") + group.add_argument('--dpo-beta', type=float, default=0.1) + group.add_argument('--dpo-label-smoothing', type=float, default=0.0) return parser @@ -308,6 +319,12 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar group.add_argument( "--instruction-valid-data-path", type=str, default=None, ) + group.add_argument( + "--dpo-train-data-path", type=str, default=None, + ) + group.add_argument( + "--dpo-valid-data-path", type=str, default=None, + ) group.add_argument( "--epoch", type=int, default=2, ) diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 3fb45d0..8669558 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -1,3 +1,4 @@ +import copy import os import sys @@ -49,7 +50,8 @@ def main() -> None: # initialize args = parse_args() - set_global_variables(args=args) + is_pretraining = not (args.instruction_tuning or args.direct_preference_optimization) + set_global_variables(args=args, build_tokenizer=is_pretraining) # Set the seeds for reproducibility set_seed(seed=args.seed) @@ -79,6 +81,7 @@ def main() -> None: "name": args.wandb_name, "config": vars(args), } + wandb.require("core") wandb.init(**wandb_setting) if torch_distributed.is_initialized(): @@ -99,6 +102,10 @@ def main() -> None: model = get_model( model_name=args.base_model, use_cache=use_cache ) + if args.direct_preference_optimization: + reference_model = copy.deepcopy(model) + for param in reference_model.parameters(): + param.requires_grad = False if args.load: load_model_state_dict(model, args.load) # type: ignore @@ -113,6 +120,13 @@ def main() -> None: elif args.fp16: model.to(torch.float16) # type: ignore + if args.direct_preference_optimization: + with preserve_fp32_buffers(reference_model): + if args.bf16: + reference_model.to(torch.bfloat16) # type: ignore + elif args.fp16: + reference_model.to(torch.float16) # type: ignore + if args.use_freeze_layers: print_rank_0("NOTE: freeze transformer layers") freeze_transformer_layers(model=model, layer_ranges=args.freeze_layers) @@ -140,9 +154,27 @@ def main() -> None: if args.fsdp_activation_checkpointing: apply_fsdp_checkpointing(model=model, model_name=args.base_model) + if args.direct_preference_optimization: + reference_model = FSDP( + reference_model, # type: ignore + auto_wrap_policy=wrapping_policy, + cpu_offload=CPUOffload(offload_params=True) if args.fsdp_cpu_offload else None, + mixed_precision=mixed_precision_policy, + sharding_strategy=get_sharding_strategy(), + device_id=torch.cuda.current_device(), + limit_all_gathers=True, + sync_module_states=args.low_cpu_fsdp, + param_init_fn=lambda module: module.to_empty( # type: ignore + device=torch.cuda.current_device(), recurse=False, # type: ignore + ) + if args.low_cpu_fsdp and rank != 0 + else None, + ) + if not args.instruction_tuning and not args.direct_preference_optimization: args.continual_pretraining = True + dpo_loss_fn = None if args.continual_pretraining: from llama_recipes.datasets.pretrain_dataset import 
build_train_valid_test_datasets from megatron_lm.megatron.data.data_samplers import build_pretraining_data_loader @@ -165,6 +197,7 @@ def main() -> None: else: from transformers import AutoTokenizer from llama_recipes.utils.instruction_tuning import get_instruction_tuning_dataloader + from llama_recipes.utils.dpo_dataset import get_dpo_dataloader hf_tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path=args.hf_transformer_model_dir @@ -190,7 +223,30 @@ def main() -> None: update_iter_info() elif args.direct_preference_optimization: - pass + from llama_recipes.utils.dpo_loss import DPOLoss + + dpo_loss_fn = DPOLoss( + beta=args.dpo_beta, + label_smoothing=args.dpo_label_smoothing, + ) + + train_dataloader = get_dpo_dataloader( + tokenizer=hf_tokenizer, # type: ignore + data_path=args.dpo_train_data_path, + train=True + ) + validation_dataloader = get_dpo_dataloader( + tokenizer=hf_tokenizer, # type: ignore + data_path=args.dpo_valid_data_path + ) + + args.train_iters = args.dpo_dataset_size // args.global_batch_size * args.epoch + args.lr_decay_iters = args.train_iters + args.lr_warmup_iters = args.lr_decay_iters // 10 + args.save_sampler_state = True + if rank == 0: + from llama_recipes.utils.wandb_utils import update_iter_info + update_iter_info() else: raise ValueError("unknown training mode") @@ -241,6 +297,8 @@ def main() -> None: gradient_accumulation_steps=args.gradient_accumulation_steps, local_rank=get_local_rank(), rank=get_rank(), + dpo_loss_fn=dpo_loss_fn, + reference_model=reference_model if args.direct_preference_optimization else None, ) diff --git a/src/llama_recipes/get_model_decoder_layer.py b/src/llama_recipes/get_model_decoder_layer.py index 6082fdc..b989d28 100644 --- a/src/llama_recipes/get_model_decoder_layer.py +++ b/src/llama_recipes/get_model_decoder_layer.py @@ -1,16 +1,19 @@ from transformers.models.llama.modeling_llama import LlamaDecoderLayer from transformers.models.mistral.modeling_mistral import MistralDecoderLayer from transformers.models.phi3.modeling_phi3 import Phi3DecoderLayer +from transformers.models.gemma2.modeling_gemma2 import Gemma2DecoderLayer def get_model_decoder_layer( model_name: str, -) -> type[LlamaDecoderLayer] | type[MistralDecoderLayer] | type[Phi3DecoderLayer]: +) -> type[LlamaDecoderLayer] | type[MistralDecoderLayer] | type[Phi3DecoderLayer] | type[Gemma2DecoderLayer]: if "Llama" in model_name or "Swallow" in model_name or "Yi" in model_name: return LlamaDecoderLayer elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: return MistralDecoderLayer elif "Phi-3" in model_name: return Phi3DecoderLayer + elif "gemma-2" in model_name: + return Gemma2DecoderLayer else: raise NotImplementedError(f"{model_name}: this model decoder layer is not implemented.") diff --git a/src/llama_recipes/get_models.py b/src/llama_recipes/get_models.py index 487ccfc..05a9851 100644 --- a/src/llama_recipes/get_models.py +++ b/src/llama_recipes/get_models.py @@ -1,8 +1,11 @@ +import time + from transformers import ( LlamaConfig, LlamaForCausalLM, MistralForCausalLM, Phi3ForCausalLM, + Gemma2ForCausalLM, AutoModelForCausalLM, ) from llama_recipes.utils.distributed import is_rank_0 @@ -12,7 +15,7 @@ def get_model( model_name: str, use_cache: bool = False -) -> LlamaForCausalLM | MistralForCausalLM | AutoModelForCausalLM: +) -> LlamaForCausalLM | MistralForCausalLM | Phi3ForCausalLM | Gemma2ForCausalLM: """return CausalLM model Args: @@ -26,6 +29,9 @@ def get_model( LlamaForCausalLM | MistralForCausalLM: PyTorch 
model """ args = get_args() + if is_rank_0(): + print("Instantiating Model ...", flush=True) + init_time = time.perf_counter() if "Llama" in model_name or "Swallow" in model_name: if args.low_cpu_fsdp: @@ -56,8 +62,6 @@ def get_model( use_cache=use_cache, ) - return model # type: ignore - elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: # If using torch.device("meta"), FSDP training hang # FYI: https://github.com/iwiwi/epochraft-hf-fsdp/pull/10#issuecomment-1803360147 @@ -77,8 +81,6 @@ def get_model( torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, ) - return model # type: ignore - elif "Phi-3" in model_name: model = Phi3ForCausalLM.from_pretrained( @@ -91,8 +93,6 @@ def get_model( torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, ) - return model # type: ignore - elif "Yi-1.5" in model_name: # https://huggingface.co/01-ai/Yi-1.5-9B/blob/main/config.json @@ -106,7 +106,21 @@ def get_model( torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, ) - return model # type: ignore + elif "gemma-2" in model_name: + model = Gemma2ForCausalLM.from_pretrained( + model_name, + load_in_8bit=True if args.quantization else None, + device_map="auto" if args.quantization else None, + use_cache=use_cache, + max_position_embeddings=args.seq_length, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, + ) else: raise NotImplementedError("model not implemented") + + if is_rank_0(): + print(f"Model instantiation took {time.perf_counter() - init_time:.2f} secs") + + return model # type: ignore diff --git a/src/llama_recipes/policies/mixed_precision.py b/src/llama_recipes/policies/mixed_precision.py index 4eb5d18..f7157c1 100644 --- a/src/llama_recipes/policies/mixed_precision.py +++ b/src/llama_recipes/policies/mixed_precision.py @@ -1,6 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. - import torch from torch.distributed.fsdp.api import ( @@ -11,7 +8,7 @@ fpSixteen = MixedPrecision( param_dtype=torch.float16, # Gradient communication precision. - reduce_dtype=torch.float16, + reduce_dtype=torch.float32, # Use float32 for gradient communication. (Llama-3, Megatron_LM like) # Buffer precision. buffer_dtype=torch.float16, ) @@ -19,7 +16,7 @@ bfSixteen = MixedPrecision( param_dtype=torch.bfloat16, # Gradient communication precision. - reduce_dtype=torch.bfloat16, + reduce_dtype=torch.float32, # Use float32 for gradient communication. (Llama-3, Megatron_LM like) # Buffer precision. buffer_dtype=torch.bfloat16, cast_forward_inputs=True, @@ -27,7 +24,7 @@ bfSixteen_mixed = MixedPrecision( param_dtype=torch.float32, - reduce_dtype=torch.bfloat16, + reduce_dtype=torch.float32, # Use float32 for gradient communication. 
(Llama-3, Megatron_LM like) buffer_dtype=torch.bfloat16, ) diff --git a/src/llama_recipes/utils/checkpoint.py b/src/llama_recipes/utils/checkpoint.py index c68ac70..4826138 100644 --- a/src/llama_recipes/utils/checkpoint.py +++ b/src/llama_recipes/utils/checkpoint.py @@ -1,6 +1,7 @@ import time import torch import torch.distributed as torch_distributed +from torch.utils.data.distributed import DistributedSampler from torch.distributed.fsdp import ( # noqa: F401 FullyShardedDataParallel as FSDP, # type: ignore StateDictType, # type: ignore @@ -9,6 +10,7 @@ from torch.distributed.fsdp.api import FullOptimStateDictConfig from pathlib import Path import os +import gc from megatron_lm.megatron.global_vars import get_args, get_sampler @@ -42,6 +44,8 @@ def save_model_state_dict(model: FSDP, path: str) -> None: print(f"Saving model state dict to {path}") torch.save(state_dict, path) print(f"Saved model state dict to {path}") + del state_dict + gc.collect() def save_optimizer_state_dict(model: FSDP, optimizer: torch.optim.Optimizer, path: str) -> None: @@ -50,6 +54,8 @@ def save_optimizer_state_dict(model: FSDP, optimizer: torch.optim.Optimizer, pat print(f"Saving optimizer state dict to {path}") torch.save(state_dict, path) print(f"Saved optimizer state dict to {path}") + del state_dict + gc.collect() def save_scheduler_state_dict(scheduler: torch.optim.lr_scheduler.LRScheduler, path: str) -> None: @@ -59,10 +65,10 @@ def save_scheduler_state_dict(scheduler: torch.optim.lr_scheduler.LRScheduler, p print(f"Saved scheduler state dict to {path}") -def save_sampler_state_dict(sampler: torch.utils.data.distributed.DistributedSampler, path: str) -> None: +def save_sampler_state_dict(sampler: DistributedSampler, path: str) -> None: if torch_distributed.get_rank() == 0: print(f"Saving sampler indices to {path}") - torch.save(sampler.state_dict(), path) + torch.save(sampler.state_dict(), path) # type: ignore print(f"Saved sampler indices to {path}") @@ -197,14 +203,14 @@ def load_scheduler_state_dict(scheduler: torch.optim.lr_scheduler.LRScheduler, p del state_dict -def load_sampler_state_dict(sampler: torch.utils.data.distributed.DistributedSampler, path: str) -> None: +def load_sampler_state_dict(sampler: DistributedSampler, path: str) -> None: latest_iteration: int = get_latest_iteration(path) if latest_iteration == 0: return latest_checkpoint_path: str = get_checkpoint_name(path, latest_iteration) state_dict = torch.load(f"{latest_checkpoint_path}/sampler.pt", map_location="cpu") - sampler.load_state_dict(state_dict) + sampler.load_state_dict(state_dict) # type: ignore del state_dict diff --git a/src/llama_recipes/utils/dpo.py b/src/llama_recipes/utils/dpo.py new file mode 100644 index 0000000..ef9d325 --- /dev/null +++ b/src/llama_recipes/utils/dpo.py @@ -0,0 +1,85 @@ +from typing import Tuple +import torch +from torch import nn + + +CROSS_ENTROPY_IGNORE_IDX = -100 + + +def get_batch_log_probs( + logits: torch.FloatTensor, + labels: torch.LongTensor, + label_pad_token_id: int = CROSS_ENTROPY_IGNORE_IDX, +) -> torch.FloatTensor: + """ + Calculate log probabilities based on provided logits and labels. + + Args: + logits (torch.FloatTensor): direct logits output of the model of shape (b, s, v) + labels (torch.LongTensor): ground-truth labels to compute log probs with, shape (b, s). + Label tokens with a value of label_pad_token_id are ignored. + label_pad_token_id (int): token id to ignore in labels. 
+ + Returns: + Calculated log probs of shape (b, ) + + Raises: + ValueError: If logits and labels have different shapes. + """ + + if logits.shape[:-1] != labels.shape: + raise ValueError( + "Logits (batch and sequence length dim) and labels must have the same shape." + ) + + labels = labels[:, 1:].clone() # type: ignore + logits = logits[:, :-1, :] # type: ignore + loss_mask = labels != label_pad_token_id + + labels[labels == label_pad_token_id] = 0 + # take log-likelihood of the labels given our model + per_token_log_probs = torch.gather( + logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2) + ).squeeze(2) + + return (per_token_log_probs * loss_mask).sum(-1) # type: ignore + + +def concatenated_forward( + model: nn.Module, batch: dict[str, torch.Tensor], local_rank: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Run forward pass of the model with chosen and rejected samples concatenated. + + Args: + model (nn.Module): The model to be used for the forward pass. + batch (Tuple[torch.Tensor, torch.Tensor]): Tuple of input_ids and labels. + + Returns: + Tuple of chosen log probs, rejected log probs, chosen logits, rejected logits. + """ + concatenated_input_ids = torch.cat( + [batch['chosen_input_ids'], batch['rejected_input_ids']], dim=0 + ) + concatenated_labels = torch.cat( + [batch['chosen_labels'], batch['rejected_labels']], dim=0 + ) + concatenated_input_ids = concatenated_input_ids.to(local_rank) + concatenated_labels = concatenated_labels.to(local_rank) + + # formed by concatenating an equal number of "chosen" and "rejected". + len_chosen = concatenated_input_ids.shape[0] // 2 + + all_logits = model(concatenated_input_ids).logits + + all_log_probs = get_batch_log_probs( + all_logits, concatenated_labels # type: ignore + ) + + chosen_log_probs = all_log_probs[:len_chosen] + rejected_log_probs = all_log_probs[len_chosen:] + + chosen_logits = all_logits[:len_chosen] + rejected_logits = all_logits[len_chosen:] + + return (chosen_log_probs, rejected_log_probs, chosen_logits, rejected_logits) diff --git a/src/llama_recipes/utils/dpo_dataset.py b/src/llama_recipes/utils/dpo_dataset.py new file mode 100644 index 0000000..2c23c14 --- /dev/null +++ b/src/llama_recipes/utils/dpo_dataset.py @@ -0,0 +1,193 @@ +import copy +import json +import os + +import numpy as np +import torch +from torch.utils.data import Dataset, DataLoader +import torch.distributed as torch_distributed +from transformers.tokenization_utils import PreTrainedTokenizer +from pathlib import Path +from llama_recipes.utils.distributed import print_rank_0 + +from megatron_lm.megatron.global_vars import get_args, set_sampler + + +class DPODataset(Dataset): + def __init__( + self, + tokenizer: PreTrainedTokenizer, + data_path: str, + ) -> None: + args = get_args() + + self.data_path: str = data_path + self.max_tokens: int = args.seq_length + self.tokenizer = tokenizer + + # system prompt + self.system_prompt_role = args.system_prompt_role + self.system_prompt_content = args.system_prompt_content + + # index file + dataset_dir = Path(self.data_path).parent + index_cache_dir = dataset_dir / ".index_cache" + os.makedirs(index_cache_dir, exist_ok=True) + index_file_path = index_cache_dir / str(os.path.basename(self.data_path)).replace(".jsonl", ".idx") + self.index_file_path: str = str(index_file_path) + + try: + with open(self.index_file_path, "r", encoding="utf-8") as f: + self.indexes: list[int] = [int(line.strip()) for line in f] + except Exception as e: + print(f"index file error: {e}") + 
exit(1)
+
+    def __len__(self) -> int:
+        return len(self.indexes)
+
+    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
+        IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
+
+        with open(self.data_path, "r", encoding="utf-8") as file:
+            offset: int = self.indexes[index]
+            file.seek(offset)
+            try:
+                line = file.readline()
+            except Exception as e:
+                print(f"index={index}, offset={offset}, error={e}")
+                exit(1)
+
+        try:
+            conversations: dict[str, list[dict[str, str]] | str] = json.loads(line)
+        except Exception as e:
+            print(f"index={index}, offset={offset}, line={line}, error={e}")
+            exit(1)
+
+        SYSTEM_PROMPT: list[dict[str, str]] = [
+            {
+                "role": self.system_prompt_role,
+                "content": self.system_prompt_content,
+            }
+        ]
+        # chat template
+        prompt = self.tokenizer.apply_chat_template(
+            conversation=SYSTEM_PROMPT + conversations["conversations"],  # type: ignore
+            add_generation_prompt=True,
+            tokenize=True,
+        )
+
+        chosen = self.tokenizer.apply_chat_template(
+            conversation=SYSTEM_PROMPT + conversations["conversations"] + [  # type: ignore
+                {"role": "assistant", "content": conversations["chosen"]}
+            ],
+            tokenize=True,
+        )
+        rejected = self.tokenizer.apply_chat_template(
+            conversation=SYSTEM_PROMPT + conversations["conversations"] + [  # type: ignore
+                {"role": "assistant", "content": conversations["rejected"]}
+            ],
+            tokenize=True,
+        )
+        chosen_input_ids: torch.Tensor = torch.tensor(chosen, dtype=torch.int64)
+        rejected_input_ids: torch.Tensor = torch.tensor(rejected, dtype=torch.int64)
+
+        if len(chosen) > self.max_tokens or len(rejected) > self.max_tokens:
+            print(f"\n\nWARNING: chosen={self.tokenizer.decode(chosen)}\n\n")
+            print(f"\n\nWARNING: rejected={self.tokenizer.decode(rejected)}\n\n")
+
+        eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0]
+        pad_token_id = eos_token_id
+
+        def pad_tensor(tensor: torch.Tensor) -> torch.Tensor:
+            padding_length: int = self.max_tokens - len(tensor)
+            if padding_length > 0:
+                pad_tensor = torch.full(
+                    (padding_length,), pad_token_id, dtype=torch.int64
+                )
+                tensor = torch.cat((tensor, pad_tensor))
+            elif padding_length < 0:
+                tensor = tensor[: self.max_tokens]
+
+            return tensor
+
+        chosen_input_ids = pad_tensor(tensor=chosen_input_ids)
+        rejected_input_ids = pad_tensor(tensor=rejected_input_ids)
+
+        chosen_labels = copy.deepcopy(chosen_input_ids)
+        rejected_labels = copy.deepcopy(rejected_input_ids)
+        # fill the prompt portion with -1 -> ignored by the loss function
+        chosen_labels[: len(prompt)] = -1
+        rejected_labels[: len(prompt)] = -1
+        chosen_label_mask = chosen_labels.ge(0)
+        rejected_label_mask = rejected_labels.ge(0)
+
+        if torch.all(chosen_label_mask == 0) or torch.all(rejected_label_mask == 0):
+            random_index: int = np.random.randint(0, len(self.indexes))
+            return self.__getitem__(random_index)
+
+        # ~label_mask -> fill the prompt portion with IGNORE_INDEX
+        chosen_labels[~chosen_label_mask] = IGNORE_INDEX
+        rejected_labels[~rejected_label_mask] = IGNORE_INDEX
+        chosen_labels[chosen_labels == pad_token_id] = IGNORE_INDEX
+        rejected_labels[rejected_labels == pad_token_id] = IGNORE_INDEX
+
+        return {
+            "chosen_input_ids": chosen_input_ids,
+            "rejected_input_ids": rejected_input_ids,
+            "chosen_labels": chosen_labels,
+            "rejected_labels": rejected_labels,
+        }
+
+
+def worker_init_fn(worker_id: int) -> None:
+    import random
+
+    args = get_args()
+
+    worker_seed = args.seed + worker_id
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+
+
+def get_dpo_dataloader(
+    tokenizer: PreTrainedTokenizer,
+    data_path: str,
+    train:
bool = False, +) -> DataLoader: + from llama_recipes.utils.sequence_length_warmup import CustomDistributedSampler + from llama_recipes.utils.checkpoint import load_sampler_state_dict + + args = get_args() + + dpo_dataset = DPODataset( + tokenizer=tokenizer, + data_path=data_path, + ) + + if train: + args.dpo_dataset_size = len(dpo_dataset) + print_rank_0(f"DPO dataset size: {args.dpo_dataset_size}") + + train_sampler = CustomDistributedSampler( + dataset=dpo_dataset, + rank=torch_distributed.get_rank(), + num_replicas=torch_distributed.get_world_size(), + shuffle=True, + seed=args.seed, + ) + + if args.load: + load_sampler_state_dict(sampler=train_sampler, path=args.load) + + set_sampler(sampler=train_sampler) + + return DataLoader( + dpo_dataset, + batch_size=args.micro_batch_size, + sampler=train_sampler, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + worker_init_fn=worker_init_fn, + ) diff --git a/src/llama_recipes/utils/dpo_loss.py b/src/llama_recipes/utils/dpo_loss.py new file mode 100644 index 0000000..7ab7169 --- /dev/null +++ b/src/llama_recipes/utils/dpo_loss.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DPOLoss(nn.Module): + """ + Direct Preference Optimization (DPO) Loss module: https://arxiv.org/abs/2305.18290. + Simply stated from the paper: + + Intuitively, the DPO update increases the relative log probability of preferred to dispreferred responses, + but it incorporates a dynamic, per-example importance weight that prevents + the model degeneration that we find occurs with a naive probability ratio objective. + + Based on the implementation in HF's TRL library: + https://github.com/huggingface/trl/blob/5d1deb1445828cfd0e947cb3a7925b1c03a283fc/trl/trainer/dpo_trainer.py#L844 + + DPO retains similarities to PPO (https://arxiv.org/abs/2009.01325), where it optimizes a policy + (language) model to align with human preferences, and regularizes the loss function using a baseline + reference (the frozen, initial language model) to prevent over-fitting to the preference dataset. + It differs from PPO by optimizing the policy model directly using labelled preference data, rather + than using an additional reward model to provide feedback. + This significantly simplifies training and reduces compute overhead. + + Args: + beta (float): Temperature parameter for the DPO loss, typically in the range of 0.1 to 0.5. Default is 0.1. + label_smoothing (float): Parameter encoding uncertainty about the labels. Default is 0. + """ + + def __init__( + self, + beta: float = 0.1, + label_smoothing: float = 0.0, + ): + super().__init__() + self.beta = beta + self.label_smoothing = label_smoothing + + def forward( + self, + policy_chosen_logps: torch.Tensor, + policy_rejected_logps: torch.Tensor, + reference_chosen_logps: torch.Tensor, + reference_rejected_logps: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps (torch.Tensor): Log probabilities of the policy model + for the chosen responses. Shape: (batch_size) + policy_rejected_logps (torch.Tensor): Log probabilities of the policy model + for the rejected responses. 
Shape: (batch_size) + reference_chosen_logps (torch.Tensor): Log probabilities of the reference model + for the chosen responses. Shape: (batch_size) + reference_rejected_logps (torch.Tensor): Log probabilities of the reference model + for the rejected responses. Shape: (batch_size) + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of three tensors: + - losses: The DPO loss for each example in the batch. + - chosen_rewards: Rewards for the chosen responses. + - rejected_rewards: Rewards for the rejected responses. + + """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + logits = pi_logratios - ref_logratios + + # The beta is a temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and + # calculates a conservative DPO loss. + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + + chosen_rewards = ( + self.beta * (policy_chosen_logps - reference_chosen_logps).detach() + ) + rejected_rewards = ( + self.beta * (policy_rejected_logps - reference_rejected_logps).detach() + ) + + return losses, chosen_rewards, rejected_rewards + + +class RSOLoss(nn.Module): + """ + Statistical Rejection Sampling Optimization (RSO) or "hinge" loss module: https://arxiv.org/abs/2309.06657. + Intuition from the paper: + + DPO is a logistic regression on human preference data, and SLiC (https://arxiv.org/abs/2305.10425) is almost + equivalent to a support vector machine (SVM) with hinge loss. [RSO] improve[s] SLiC as the SVM counter part of DPO. + + Based on the implementation in HF's TRL library: + https://github.com/huggingface/trl/blob/4dce042a3863db1d375358e8c8092b874b02934b/trl/trainer/dpo_trainer.py#L1141 + + Args: + gamma (float): Equivalent temperature parameter (from DPO) for the RSO loss. + """ + + def __init__( + self, + gamma: float = 0.1, + ): + super().__init__() + self.gamma = gamma + + def forward( + self, + policy_chosen_logps: torch.Tensor, + policy_rejected_logps: torch.Tensor, + reference_chosen_logps: torch.Tensor, + reference_rejected_logps: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute the RSO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps (torch.Tensor): Log probabilities of the policy model + for the chosen responses. Shape: (batch_size) + policy_rejected_logps (torch.Tensor): Log probabilities of the policy model + for the rejected responses. Shape: (batch_size) + reference_chosen_logps (torch.Tensor): Log probabilities of the reference model + for the chosen responses. Shape: (batch_size) + reference_rejected_logps (torch.Tensor): Log probabilities of the reference model + for the rejected responses. Shape: (batch_size) + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of three tensors: + - losses: The RSO loss for each example in the batch. + - chosen_rewards: Rewards for the chosen responses. + - rejected_rewards: Rewards for the rejected responses. 
+ + """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + logits = pi_logratios - ref_logratios + + losses = torch.relu(1 - self.gamma * logits) + + chosen_rewards = ( + self.gamma * (policy_chosen_logps - reference_chosen_logps).detach() + ) + rejected_rewards = ( + self.gamma * (policy_rejected_logps - reference_rejected_logps).detach() + ) + + return losses, chosen_rewards, rejected_rewards + + +class IPOLoss(nn.Module): + """ + Identity Preference Optimization (IPO) Loss module: https://arxiv.org/abs/2310.12036. + Intuition from the paper: + + (Given a policy pi and reference policy, pi_ref) + + IPO learns from preferences dataset simply by regressing the gap between log-likelihood ratios + + log(pi(chosen)/pi(rejected)) and log(pi_ref(chosen)/pi_ref(rejected)) + + to 1/(2*tau), where tau is the temperature parameter. [T]he weaker the regularisation becomes, the + higher would be the log-likelihood ratio of chosen to rejected logprobs. In other words IPO, unlike DPO, + always regularizes its solution towards pi_ref by controlling the gap between the log-likelihood ratios + + log(pi(chosen)/pi(rejected)) and log(pi_ref(chosen)/pi_ref(rejected)) + + thus avoiding the over-fitting to the preference dataset. + + Based on the implementation in HF's TRL library: + https://github.com/huggingface/trl/blob/4dce042a3863db1d375358e8c8092b874b02934b/trl/trainer/dpo_trainer.py#L1143 + + + Args: + tau (float): Equivalent temperature scaling parameter (from DPO) for the IPO loss. From the TRL documentation: + + the [tau] parameter is the reciprocal of the gap between the log-likelihood ratios of the + chosen vs the rejected completion pair and thus the smaller the tau the larger this gap is. + """ + + def __init__( + self, + tau: float = 0.1, + ): + super().__init__() + self.tau = tau + + def forward( + self, + policy_chosen_logps: torch.Tensor, + policy_rejected_logps: torch.Tensor, + reference_chosen_logps: torch.Tensor, + reference_rejected_logps: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps (torch.Tensor): Log probabilities of the policy model + for the chosen responses. Shape: (batch_size) + policy_rejected_logps (torch.Tensor): Log probabilities of the policy model + for the rejected responses. Shape: (batch_size) + reference_chosen_logps (torch.Tensor): Log probabilities of the reference model + for the chosen responses. Shape: (batch_size) + reference_rejected_logps (torch.Tensor): Log probabilities of the reference model + for the rejected responses. Shape: (batch_size) + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of three tensors: + - losses: The DPO loss for each example in the batch. + - chosen_rewards: Rewards for the chosen responses. + - rejected_rewards: Rewards for the rejected responses. 
+ + """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + logits = pi_logratios - ref_logratios + + losses = (logits - 1 / (2 * self.tau)) ** 2 + + chosen_rewards = ( + self.tau * (policy_chosen_logps - reference_chosen_logps).detach() + ) + rejected_rewards = ( + self.tau * (policy_rejected_logps - reference_rejected_logps).detach() + ) + + return losses, chosen_rewards, rejected_rewards diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index a843529..1cb73aa 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -4,7 +4,8 @@ import numpy as np import torch -from torch.utils.data import Dataset +from torch.utils.data import Dataset, DataLoader +import torch.distributed as torch_distributed from transformers.tokenization_utils import PreTrainedTokenizer from pathlib import Path from llama_recipes.utils.distributed import print_rank_0 @@ -24,6 +25,10 @@ def __init__( self.max_words: int = args.seq_length self.tokenizer = tokenizer + # system prompt + self.system_prompt_role = args.system_prompt_role + self.system_prompt_content = args.system_prompt_content + # index file dataset_dir = Path(self.data_path).parent index_cache_dir = dataset_dir / ".index_cache" @@ -54,60 +59,65 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: exit(1) try: - conversations: dict[str, str | list[dict[str, str]]] = json.loads(line) + conversations: dict[str, list[dict[str, str]] | str] = json.loads(line) except Exception as e: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) - SYSTEM_PROMPT = [ - {"role": "system", "text": "あなたは誠実で優秀な日本人のアシスタントです。"} + SYSTEM_PROMPT: list[dict[str, str]] = [ + { + "role": self.system_prompt_role, + "content": self.system_prompt_content, + } ] # chat template - prompt: str = self.tokenizer.apply_chat_template( + prompt = self.tokenizer.apply_chat_template( conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - tokenize=False + add_generation_prompt=True, + tokenize=True, ) - example: str = prompt + conversations["output"] # type: ignore - encoded_prompt: torch.Tensor = torch.tensor( - self.tokenizer.encode(prompt, add_special_tokens=False), - dtype=torch.int64 - ) - encoded_example: list[int] = self.tokenizer.encode( - example, add_special_tokens=False + example = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore + {"role": "assistant", "content": conversations["output"]} + ], + tokenize=True, ) - encoded_example.append(self.tokenizer.eos_token_id) # type: ignore - encoded_tensor_example: torch.Tensor = torch.tensor(encoded_example, dtype=torch.int64) - - if len(encoded_example) > self.max_words: - print(f"\n\nWARNING: example={example}\n\n") - - padding: int = self.max_words - encoded_tensor_example.shape[0] - if padding > 0: # pad_token_id = 0 (substitute unk_token) - encoded_tensor_example = torch.cat((encoded_tensor_example, torch.zeros(padding, dtype=torch.int64) - 1)) - elif padding < 0: - encoded_tensor_example = encoded_tensor_example[: self.max_words] - - labels = copy.deepcopy(encoded_tensor_example) + tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + + if len(example) > self.max_words: + print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") + + padding_length: int = self.max_words - len(example) + eos_token_id: int = 
self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] + pad_token_id = eos_token_id + if padding_length > 0: + pad_tensor = torch.full( + (padding_length,), pad_token_id, dtype=torch.int64 + ) + tensor_example = torch.cat((tensor_example, pad_tensor)) + elif padding_length < 0: + tensor_example = tensor_example[: self.max_words] + + labels = copy.deepcopy(tensor_example) # promptの長さ分だけ -1 で埋める -> 損失関数で無視するようになる - labels[: len(encoded_prompt)] = -1 - # 0より大きい(ge)かどうかの真偽値でmaskを作成 - example_mask = encoded_tensor_example.ge(0) + labels[: len(prompt)] = -1 label_mask = labels.ge(0) - if torch.all(label_mask == 0): # len(output) == 0 + if torch.all(label_mask == 0): # 予測部分がない random_index: int = np.random.randint(0, len(self.indexes)) self.__getitem__(random_index) - # ~example_mask -> paddingの部分を 0 で埋める - encoded_tensor_example[~example_mask] = 0 # ~label_mask -> prompt の部分を ignore_index で埋める labels[~label_mask] = IGNORE_INDEX + labels[labels == pad_token_id] = IGNORE_INDEX + # mask out pad token + attention_mask = (tensor_example != pad_token_id).float() return { - "input_ids": encoded_tensor_example, + "input_ids": tensor_example, "labels": labels, - "attention_mask": example_mask.float(), + "attention_mask": attention_mask, } @@ -125,7 +135,7 @@ def get_instruction_tuning_dataloader( tokenizer: PreTrainedTokenizer, data_path: str, train: bool = False, -) -> torch.utils.data.DataLoader: +) -> DataLoader: from llama_recipes.utils.sequence_length_warmup import CustomDistributedSampler from llama_recipes.utils.checkpoint import load_sampler_state_dict @@ -142,8 +152,8 @@ def get_instruction_tuning_dataloader( train_sampler = CustomDistributedSampler( dataset=instruction_dataset, - rank=torch.distributed.get_rank(), - num_replicas=torch.distributed.get_world_size(), + rank=torch_distributed.get_rank(), + num_replicas=torch_distributed.get_world_size(), shuffle=True, seed=args.seed, ) @@ -153,7 +163,7 @@ def get_instruction_tuning_dataloader( set_sampler(sampler=train_sampler) - return torch.utils.data.DataLoader( + return DataLoader( instruction_dataset, batch_size=args.micro_batch_size, sampler=train_sampler, diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index cd9c800..12b948e 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -1,6 +1,5 @@ import os import time -from contextlib import nullcontext import torch import torch.cuda.nccl as nccl @@ -11,6 +10,7 @@ from llama_recipes.policies import fpSixteen, bfSixteen, bfSixteen_mixed, get_decoder_layer_wrapper from llama_recipes.utils.wandb_utils import log_model_info, log_wandb from llama_recipes.utils.checkpoint import save_checkpoint, get_latest_iteration +from llama_recipes.utils.dpo_loss import DPOLoss from typing import Optional, Any import wandb @@ -39,6 +39,8 @@ def train( gradient_accumulation_steps: int, local_rank: Optional[int] = None, rank: Optional[int] = None, + dpo_loss_fn: Optional[DPOLoss] = None, + reference_model: Optional[torch.nn.Module] = None, ) -> None: """ Trains the model on the given dataloader @@ -63,7 +65,6 @@ def train( world_size = int(os.environ["WORLD_SIZE"]) local_rank = local_rank if local_rank is not None else 0 - autocast = torch.cuda.amp.autocast if args.fp16 else nullcontext # type: ignore # set model info if rank == 0 and args.wandb_name: @@ -100,11 +101,60 @@ def train( batch = next(train_dataloader) - for key in batch.keys(): - batch[key] = batch[key].to(local_rank) + if 
args.direct_preference_optimization: + # DPO( Direct Preference Optimization) + from llama_recipes.utils.dpo import concatenated_forward + + if dpo_loss_fn is None: + raise ValueError( + "DPO(Direct Preference Optimization) is enabled, but dpo loss function is None" + ) + if reference_model is None: + raise ValueError( + "DPO(Direct Preference Optimization) is enabled, but reference model is None" + ) + + # forward + ( + policy_chosen_log_probs, + policy_rejected_log_probs, + policy_chosen_logits, + policy_rejected_logits, + ) = concatenated_forward(model=model, batch=batch, local_rank=local_rank) + + policy_chosen_logits_mean = policy_chosen_logits.detach().mean() + policy_rejected_logits_mean = policy_rejected_logits.detach().mean() + + # deleting logits here helps reduce (peak) memory usage - we only need them for metric logging + del policy_chosen_logits, policy_rejected_logits + + with torch.no_grad(): + ( + reference_chosen_log_probs, + reference_rejected_log_probs, + _, + _, + ) = concatenated_forward(model=reference_model, batch=batch, local_rank=local_rank) + + loss, chosen_rewards, rejected_rewards = dpo_loss_fn( + policy_chosen_log_probs, + policy_rejected_log_probs, + reference_chosen_log_probs, + reference_rejected_log_probs, + ) + loss = loss.mean() + reward_accuracies = (chosen_rewards > rejected_rewards).float() + else: + # continual-pre-training & Instruction Tuning + for key in batch.keys(): + batch[key] = batch[key].to(local_rank) + + with torch.cuda.amp.autocast( + enabled=args.mixed_precision, + dtype=torch.bfloat16 if args.bf16 else torch.float16 + ): + loss: torch.Tensor = model(**batch).loss - with autocast(): - loss: torch.Tensor = model(**batch).loss loss = loss / gradient_accumulation_steps if args.fp16: @@ -119,8 +169,13 @@ def train( # gradient clipping if args.grad_clip_norm > 0: clip_grad_norm_(model.parameters(), args.grad_clip_norm) - real_batch_size: int = batch["input_ids"].shape[0] - real_seq_len: int = batch["input_ids"].shape[1] + + if args.direct_preference_optimization: + real_batch_size: int = batch["chosen_input_ids"].shape[0] + real_seq_len: int = batch["chosen_input_ids"].shape[1] + else: + real_batch_size: int = batch["input_ids"].shape[0] + real_seq_len: int = batch["input_ids"].shape[1] # gradient accumulation end iteration += 1 @@ -152,6 +207,21 @@ def train( world_size=world_size, iteration_start_time=iteration_start_time, ) + if args.direct_preference_optimization: + wandb.log( + { + "rewards/chosen": chosen_rewards.mean().cpu(), + "rewards/rejected": rejected_rewards.mean().cpu(), + "rewards/accuracies": reward_accuracies.mean().cpu(), + "rewards/margins": (chosen_rewards - rejected_rewards).mean().cpu(), # type: ignore + "log_probs/rejected": policy_rejected_log_probs.detach().mean().cpu(), + "log_probs/chosen": policy_chosen_log_probs.detach().mean().cpu(), + "logits/rejected": policy_rejected_logits_mean.cpu(), + "logits/chosen": policy_chosen_logits_mean.cpu(), + }, + step=iteration, + ) + total_loss = 0.0 iteration_start_time = time.perf_counter() diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index 4efc617..e54860b 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -4,12 +4,16 @@ #$ -j y #$ -o outputs/convert/ckpt/ #$ -cwd + # module load source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load 
nccl/2.16/2.16.2-1 +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 module load hpcx/2.12 +module load gcc/11.4.0 set -e diff --git a/tools/dataset/convert_dataset_dpo.py b/tools/dataset/convert_dataset_dpo.py new file mode 100644 index 0000000..4b487dc --- /dev/null +++ b/tools/dataset/convert_dataset_dpo.py @@ -0,0 +1,42 @@ +import argparse +import json + + +def convert_jsonl(input_path: str, output_path: str, from_key: str, value_key: str) -> None: + converted_data = [] + + with open(input_path, 'r', encoding='utf-8') as file: + for line in file: + item = json.loads(line) + conversations = item['conversations'] + chosen = item['chosen'] + rejected = item['rejected'] + converted_conversations = [] + for conversation in conversations: + converted_conversation = { + 'role': conversation[from_key], + 'content': conversation[value_key] + } + converted_conversations.append(converted_conversation) + converted_data.append({ + 'conversations': converted_conversations, + 'chosen': chosen, + 'rejected': rejected + }) + with open(output_path, 'w', encoding='utf-8') as outfile: + for item in converted_data: + outfile.write(json.dumps(item, ensure_ascii=False) + '\n') + +def main(): + parser = argparse.ArgumentParser(description="Convert JSONL file keys to specified format.") + parser.add_argument('--input-path', type=str, help='Path to the input JSONL file') + parser.add_argument('--output-path', type=str, help='Path to the output JSONL file') + parser.add_argument('--from-key', type=str, default='from', help='Key name to be converted to role') + parser.add_argument('--value-key', type=str, default='value', help='Key name to be converted to context') + + args = parser.parse_args() + + convert_jsonl(args.input_path, args.output_path, args.from_key, args.value_key) + +if __name__ == "__main__": + main() diff --git a/tools/dataset/convert_dataset_instruct.py b/tools/dataset/convert_dataset_instruct.py new file mode 100644 index 0000000..ed35caf --- /dev/null +++ b/tools/dataset/convert_dataset_instruct.py @@ -0,0 +1,42 @@ +import argparse +import json +import copy + + +def convert_jsonl(input_path: str, output_path: str) -> None: + converted_data = [] + + with open(input_path, 'r', encoding='utf-8') as file: + for line in file: + item = json.loads(line) + messages = item['messages'] + + assert len(messages) % 2 == 0 + conversation_turn: int = len(messages) // 2 + + inputs = [] + for i in range(conversation_turn): + user_message = messages[i * 2] + assistant_message = messages[i * 2 + 1] + inputs.append(user_message) + converted_data.append({ + "input": copy.deepcopy(inputs), + "output": assistant_message + }) + inputs.append(assistant_message) + + with open(output_path, 'w', encoding='utf-8') as outfile: + for item in converted_data: + outfile.write(json.dumps(item, ensure_ascii=False) + '\n') + +def main(): + parser = argparse.ArgumentParser(description="Convert JSONL file keys to specified format.") + parser.add_argument('--input-path', type=str, help='Path to the input JSONL file') + parser.add_argument('--output-path', type=str, help='Path to the output JSONL file') + + args = parser.parse_args() + + convert_jsonl(args.input_path, args.output_path) + +if __name__ == "__main__": + main() diff --git a/tools/dataset/debug_chat_template.py b/tools/dataset/debug_chat_template.py index 03e22b6..025b71e 100644 --- a/tools/dataset/debug_chat_template.py +++ b/tools/dataset/debug_chat_template.py @@ -13,28 +13,54 @@ 
"input": [ { "role": "user", - "text": "こんにちは!" + "content": "こんにちは!" }, { "role": "assistant", - "text": "こんにちは!ご質問やお困りのことがありましたら、何でもご相談ください。何が必要か教えてください。" + "content": "こんにちは!ご質問やお困りのことがありましたら、何でもご相談ください。何が必要か教えてください。" }, { "role": "user", - "text": "世界のすべての国をアルファベット順に、それぞれの国の人口を教えてください。" + "content": "世界のすべての国をアルファベット順に、それぞれの国の人口を教えてください。" } - ], - "output": "世界中の国をアルファベット順に並べたリストと、その国の推定人口です:\n\nアフガニスタン: 38,928,346 アルバニア: 2,877,797 アルジェリア:44,344,744 アンドラ: 77,265 アンゴラ: 32,878,272 アンティグア・バーブーダ: 97,929 アルゼンチン: 45,195,774 アルメニア: 2,977,600 オーストラリア: 25,499,884 オーストリア: 9,006,398 アゼルバイジャン: 10,134,604 バハマ:393,248 バーレーン: 1,714,571 バングラデシュ: 164,689,383164,689,383 バルバドス: 287,375 ベラルーシ: 9,449,323 ベルギー: 11,589,623 ベリーズ: 397,628 ベナン: 12,123,200 ブータン: 754,288 ボリビア: 11,673,095 ボスニア・ヘルツェゴビナ: 3,279,000 ボツワナ:2,335,814 ブラジル: 212,559,417 ブルネイ: 441,484 ブルガリア: 7,034,879 ブルキナファソ: 20,903,273 ブルンジ: 11,890,781 カボベルデ: 555,987 カンボジア: 16,205,218 カメルーン: 26,545,863 カナダ:37,742,154 中央アフリカ共和国4,829,767 チャド: 16,425,864 チリ: 19,116,201 中国: 1,439,323,776 コロンビア: 50,882,891 コモロ: 869,601 コンゴ民主共和国:87,534,403 コンゴ共和国5,457,821 コスタリカ5,094,118 コートジボワール: 26,378,274 クロアチア: 4,105,267 キューバ: 11,239,224 キプロス:1,207,359 チェコ:10,708,919 デンマーク:5,792,2025,792,202" + ] } -chat_template: str = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['text'] %}{% elif false == true and not '<>' in messages[0]['text'] %}{% set loop_messages = messages %}{% set system_message = 'あなたは誠実で優秀な日本人のアシスタントです。' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['text'] %}{% else %}{% set content = message['text'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" +conversations_with_output = [ + { + "role": "user", + "content": "こんにちは!" 
+    },
+    {
+        "role": "assistant",
+        "content": "こんにちは!ご質問やお困りのことがありましたら、何でもご相談ください。何が必要か教えてください。"
+    },
+    {
+        "role": "user",
+        "content": "世界のすべての国をアルファベット順に、それぞれの国の人口を教えてください。"
+    },
+    {
+        "role": "assistant",
+        "content": "output",
+    }
+]
+
+chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
 
 print("before apply chat template")
 
 prompt: str = tokenizer.apply_chat_template(
-    [{"role": "system", "text": "あなたは誠実で優秀な日本人のアシスタントです。"}] + conversations["input"],  # type: ignore
-    # chat_template=chat_template,
-    tokenize=False
+    [{"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。"}] + conversations["input"],  # type: ignore
+    add_generation_prompt=True,
+    # tokenize=False
 )
 print(prompt)
+print(type(prompt))
+
+print("--------------------------------")
+
+print(tokenizer.apply_chat_template(
+    conversation=conversations_with_output,
+    # tokenize=False
+))
diff --git a/tools/inference/inference.py b/tools/inference/inference.py
index 0828a1b..e0a1a58 100644
--- a/tools/inference/inference.py
+++ b/tools/inference/inference.py
@@ -2,7 +2,7 @@
 
 import torch
-from transformers import AutoTokenizer, MistralForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 parser = argparse.ArgumentParser(description="Generation")
@@ -17,7 +17,7 @@
 tokenizer = AutoTokenizer.from_pretrained(
     pretrained_model_name_or_path=args.tokenizer_path,
 )
-model = MistralForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
     args.model_path, device_map="auto", torch_dtype=torch.bfloat16
 )
@@ -29,8 +29,8 @@
 )
 outputs = model.generate(  # type: ignore
     input_ids.to(device=model.device),  # type: ignore
-    max_new_tokens=128,
-    temperature=0.99,
+    max_new_tokens=1024,
+    temperature=0.7,
     top_p=0.95,
     do_sample=True,
 )
diff --git a/tools/inference/inference.sh b/tools/inference/inference.sh
index 33d8c51..4d05001 100644
--- a/tools/inference/inference.sh
+++ b/tools/inference/inference.sh
@@ -4,24 +4,22 @@
 #$ -j y
 #$ -o outputs/inference/
 #$ -cwd
+
 # module load
 source /etc/profile.d/modules.sh
-module load cuda/11.8/11.8.0
-module load cudnn/8.9/8.9.2
-module load nccl/2.16/2.16.2-1
+module use /bb/llm/gaf51275/modules/modulefiles
+
+module load cuda/12.1/12.1.1
+module load cudnn/cuda-12.1/9.0.0
+module load nccl/2.20.5
 module load hpcx/2.12
+module load gcc/11.4.0
 
 set -e
 
 # swich virtual env
 source .env/bin/activate
 
-# distributed settings
-export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
-export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))
-
-echo "MASTER_ADDR=${MASTER_ADDR}"
-
 python tools/inference/inference.py \
     --model-path /bb/llm/gaf51275/llama/converted-hf-checkpoint/mistral-7B-VE/okazaki-cc/iter_0004000 \
     --tokenizer-path /bb/llm/gaf51275/llama/converted-hf-checkpoint/mistral-7B-VE/okazaki-cc/iter_0004000 \
diff --git a/tools/model-upload/upload.py b/tools/model-upload/upload.py
index 76816fd..91828fe 100644
--- a/tools/model-upload/upload.py
+++ b/tools/model-upload/upload.py
@@ -1,25 +1,41 @@
 import os
 import argparse
-
+from tqdm import tqdm
 from huggingface_hub import HfApi, create_repo
 
+def upload_directory(api, local_dir, repo_name, repo_type, branch_name):
+    for root, _, files in os.walk(local_dir):
+        for file in tqdm(files, desc=f"Uploading files in {root}"):
+            local_path = os.path.join(root, file)
+            repo_path = os.path.relpath(local_path, local_dir)
+
+            print(f"Uploading {repo_path} to branch {branch_name}...")
+            api.upload_file(
+                path_or_fileobj=local_path,
+                path_in_repo=repo_path,
+                repo_id=repo_name,
+                repo_type=repo_type,
+                commit_message=f"Upload {repo_path}",
+                revision=branch_name,
+            )
+            print(f"Successfully uploaded {repo_path}!")
+
+
 parser = argparse.ArgumentParser()
-parser.add_argument("--ckpt-path", type=str)
-parser.add_argument("--repo-name", type=str)
-parser.add_argument("--branch-name", type=str, default="main")
+parser.add_argument("--ckpt-path", type=str, help="Path to the checkpoint directory")
+parser.add_argument("--repo-name", type=str, help="Name of the Hugging Face repository")
+parser.add_argument("--branch-name", type=str, default="main", help="Branch name in the repository")
 args = parser.parse_args()
 
 converted_ckpt: str = args.ckpt_path
 repo_name: str = args.repo_name
 branch_name: str = args.branch_name
+
 try:
     create_repo(repo_name, repo_type="model", private=True)
 except Exception as e:
-    print(f"repo {repo_name} already exists! error: {e}")
-    pass
-
-files = os.listdir(converted_ckpt)
+    print(f"Repository {repo_name} already exists! Error: {e}")
 
 api = HfApi()
 if branch_name != "main":
@@ -29,17 +45,9 @@
         repo_type="model",
         branch=branch_name,
     )
-    except Exception:
-        print(f"branch {branch_name} already exists, try again...")
-print(f"to upload: {files}")
-for file in files:
-    print(f"Uploading {file} to branch {branch_name}...")
-    api.upload_file(
-        path_or_fileobj=os.path.join(converted_ckpt, file),
-        path_in_repo=file,
-        repo_id=repo_name,
-        repo_type="model",
-        commit_message=f"Upload {file}",
-        revision=branch_name,
-    )
-    print(f"Successfully uploaded {file} !")
+    except Exception as e:
+        print(f"Branch {branch_name} already exists. Error: {e}")
+
+print(f"Starting upload of directory: {converted_ckpt}")
+upload_directory(api, converted_ckpt, repo_name, "model", branch_name)
+print("Upload completed successfully!")
diff --git a/tools/model-upload/upload.sh b/tools/model-upload/upload.sh
index a5dba10..b60b766 100644
--- a/tools/model-upload/upload.sh
+++ b/tools/model-upload/upload.sh
@@ -2,16 +2,42 @@
 
 set -e
 
-start=2080
-end=2080
-increment=5000
+start=9250
+end=9250
+increment=2500
 
-upload_base_dir=/bb/llm/gaf51275/llama/converted-hf-checkpoint/Swallow-7b-VE-chat/imitation-1-and-2-lr_2e-5-minlr_2e-6-GB_64
+EXPERIMENT_NAME=exp1
+
+upload_base_dir=/bb/llm/gaf51275/2024/checkpoints/Llama-3-8b-instruct-v0.2/${EXPERIMENT_NAME}/LR1.0E-5-MINLR1.0E-6-WD0.1
+
+upload_checkpoint() {
+    local upload_dir=$1
+    local repo_name=$2
+    local max_retries=5
+    local retry_count=0
+
+    while [ $retry_count -lt $max_retries ]; do
+        if python scripts/abci/upload/upload.py \
+            --ckpt-path "$upload_dir" \
+            --repo-name "$repo_name"; then
+            echo "Successfully uploaded $repo_name"
+            return 0
+        else
+            echo "Upload failed for $repo_name. Retrying..."
+            ((retry_count++))
+            sleep 5
+        fi
+    done
+
+    echo "Failed to upload $repo_name after $max_retries attempts"
+    return 1
+}
 
 for ((i = start; i <= end; i += increment)); do
     upload_dir=$upload_base_dir/iter_$(printf "%07d" $i)
+    repo_name="tokyotech-llm/Llama-3-8b-instruct-v0.2-${EXPERIMENT_NAME}-LR1.0e-5-MINLR1.0E-6-iter$(printf "%07d" $i)"
 
-    python tools/model-upload/upload.py \
-        --ckpt-path $upload_dir \
-        --repo-name tokyotech-llm/Swallow-7b-VE-instruct-v1.0-imitation-1-and-2-lr_2e-5-minlr_2e-6-GB_64-iter$(printf "%07d" $i)
+    if ! upload_checkpoint "$upload_dir" "$repo_name"; then
+        echo "Skipping to next checkpoint after repeated failures for $repo_name"
+    fi
 done
diff --git a/tools/pre-process/scripts/index.sh b/tools/pre-process/scripts/index.sh
index e333900..2a46f3f 100644
--- a/tools/pre-process/scripts/index.sh
+++ b/tools/pre-process/scripts/index.sh
@@ -2,39 +2,8 @@
 
 source .env/bin/activate
 
-INPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training
+INPUT_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k
 
 # baseline
 python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline/val.jsonl
-
-# baseline-imitation_2
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline-imitation_2/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline-imitation_2/val.jsonl
-
-# ichikara
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/ichikara/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/ichikara/val.jsonl
-
-# imitation_1_and_2
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_1_and_2/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_1_and_2/val.jsonl
-
-# imitation_2_oasst2_top1
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_2_oasst2_top1/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_2_oasst2_top1/val.jsonl
+    --data-file-path $INPUT_DIR/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl
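Note: as a quick sanity check of the two new converters added above, the following is a minimal usage sketch; the paths are hypothetical placeholders and only illustrate the ShareGPT-style ('from'/'value') and chat-style ('messages') inputs that the code expects.

    # DPO data: rewrite each conversation turn from {'from', 'value'} keys to
    # {'role', 'content'} keys, keeping the 'chosen'/'rejected' fields unchanged.
    python tools/dataset/convert_dataset_dpo.py \
        --input-path /path/to/dpo_raw.jsonl \
        --output-path /path/to/dpo_converted.jsonl \
        --from-key from \
        --value-key value

    # Instruction data: split an even-length 'messages' list into one
    # {"input": [...], "output": ...} record per assistant turn, with the
    # running conversation history copied into "input".
    python tools/dataset/convert_dataset_instruct.py \
        --input-path /path/to/chat_raw.jsonl \
        --output-path /path/to/instruct_converted.jsonl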