-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from okoge-kaz/feature/phi-3
Support Phi-3, Yi-1.5, Codestral
- Loading branch information
Showing
12 changed files
with
388 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
#!/bin/bash
#SBATCH --job-name=codestral
#SBATCH --partition=a3
#SBATCH --exclusive
#SBATCH --nodes 2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/codestral/%x-%j.out
#SBATCH --error=outputs/codestral/%x-%j.out
#
# Slurm batch job that fine-tunes Codestral-22B-v0.1 by launching
# examples/finetuning.py through mpirun, one rank per GPU.
#
# The script uses relative paths (.env/, examples/, outputs/), so it is
# expected to be submitted from the directory that contains them.

set -e

# module load
module load cuda/12.1
module load cudnn/8.9.7
module load hpcx/2.17.1

# open file limit
# NOTE(review): ulimit takes a single limit value; the second number looks like
# an intended hard limit and is presumably ignored by bash - confirm.
ulimit -n 65536 1048576

# python virtualenv
source .env/bin/activate

# Important TCPX environment variables
UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"

# Only use TCPX for multi-node jobs.
if [[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]]; then
  export USE_TCPX=yes

  # Set up NCCL Environment variables
  export NCCL_NET=GPUDirectTCPX_v7
  # These network interfaces use Ubuntu's consistent naming scheme. See
  # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
  export NCCL_SOCKET_IFNAME=enp0s12
  export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
  export NCCL_CROSS_NIC=0
  export NCCL_ALGO=Ring
  export NCCL_PROTO=Simple
  export NCCL_NSOCKS_PERTHREAD=4
  export NCCL_SOCKET_NTHREADS=1
  export NCCL_DYNAMIC_CHUNK_SIZE=524288
  export NCCL_P2P_NET_CHUNKSIZE=524288
  export NCCL_P2P_PCI_CHUNKSIZE=524288
  export NCCL_P2P_NVL_CHUNKSIZE=1048576
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  export NCCL_NET_GDR_LEVEL=PIX
  export NCCL_P2P_PXN_LEVEL=0
  export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="${UDS_PATH}"
  export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
  export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
  export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"

  export LD_LIBRARY_PATH="/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}"
else
  export USE_TCPX=no
  unset NCCL_NET
fi

# distributed settings
# Assign first, export second, so a failing command substitution is not
# hidden behind the exit status of `export`.
MASTER_ADDR=$(scontrol show hostname "${SLURM_JOB_NODELIST}" | head -n1)
export MASTER_ADDR
# Derive the port from the job id so it always falls in 10000-59999.
MASTER_PORT=$((10000 + SLURM_JOB_ID % 50000))
export MASTER_PORT

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile
export NUM_GPU_PER_NODE=8
NODE_TYPE="H100"

NUM_NODES="${SLURM_JOB_NUM_NODES}"
NUM_GPUS=$((NUM_NODES * NUM_GPU_PER_NODE))

# training config
SEQ_LENGTH=4096

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000

# optimizer config
LR=2.5E-5
MIN_LR=2.5E-6
LR_WARMUP_STEPS=1000
LR_DECAY_STEPS=25000
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# model config
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model
CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1
CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Codestral-22B-v0.1

mkdir -p "${CHECKPOINT_SAVE_DIR}"

# data config
DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1

# --data-path receives a flat list of "<number> <dataset prefix>" pairs; an
# array keeps each element a separate argument without relying on word
# splitting of an unquoted string.
TRAIN_DATA_PATH=()

# ja wiki
TRAIN_DATA_PATH+=(2741303196 "${DATASET_DIR}/ja_wiki_text_document")

# job name
JOB_NAME="Codestral-22B-v0.1-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np "${NUM_GPUS}" \
  --npernode "${NUM_GPU_PER_NODE}" \
  -x MASTER_ADDR="${MASTER_ADDR}" \
  -x MASTER_PORT="${MASTER_PORT}" \
  -bind-to none \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length "${SEQ_LENGTH}" \
  --sliding-window-size "${SEQ_LENGTH}" \
  --micro-batch-size "${MICRO_BATCH_SIZE}" \
  --global-batch-size "${GLOBAL_BATCH_SIZE}" \
  --train-iters "${TRAIN_STEPS}" \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model "${TOKENIZER_MODEL}" \
  --data-path "${TRAIN_DATA_PATH[@]}" \
  --split 949,50,1 \
  --lr "${LR}" \
  --min-lr "${MIN_LR}" \
  --lr-decay-style cosine \
  --lr-warmup-iters "${LR_WARMUP_STEPS}" \
  --lr-decay-iters "${LR_DECAY_STEPS}" \
  --weight-decay "${WEIGHT_DECAY}" \
  --grad-clip-norm "${GRAD_CLIP}" \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-5 \
  --save-interval 500 \
  --eval-interval 100 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model "${CHECKPOINT_DIR}" \
  --save "${CHECKPOINT_SAVE_DIR}" \
  --load "${CHECKPOINT_SAVE_DIR}" \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --use-mpi \
  --wandb-entity "okoge" \
  --wandb-project "llm-recipes" \
  --wandb-name "${JOB_NAME}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#!/bin/bash
#
# Tokenizes the sample Japanese Wikipedia JSONL file with the Codestral-22B-v0.1
# tokenizer by calling megatron_lm/tools/preprocess_data.py.
# Relative paths (.env/, megatron_lm/) are used, so run it from the directory
# that contains them.

# Stop on the first failure so a broken virtualenv activation does not fall
# through to whatever `python` happens to be on PATH.
set -e

# switch virtual env
source .env/bin/activate

DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model

mkdir -p "${OUTPUT_DIR}"

# tokenize japanese wikipedia
python megatron_lm/tools/preprocess_data.py \
  --input "${DATASET_DIR}/ja_wiki.jsonl" \
  --output-prefix "${OUTPUT_DIR}/ja_wiki" \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model "${TOKENIZER_MODEL}" \
  --append-eod \
  --workers 64
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#!/bin/bash
#
# Tokenizes the sample Japanese Wikipedia JSONL file with the Yi-1.5-9B
# tokenizer by calling megatron_lm/tools/preprocess_data.py.
# Relative paths (.env/, megatron_lm/) are used, so run it from the directory
# that contains them.

# Stop on the first failure so a broken virtualenv activation does not fall
# through to whatever `python` happens to be on PATH.
set -e

# switch virtual env
source .env/bin/activate

DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/yi-1.5
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model

mkdir -p "${OUTPUT_DIR}"

# tokenize japanese wikipedia
python megatron_lm/tools/preprocess_data.py \
  --input "${DATASET_DIR}/ja_wiki.jsonl" \
  --output-prefix "${OUTPUT_DIR}/ja_wiki" \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model "${TOKENIZER_MODEL}" \
  --append-eod \
  --workers 64
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
#!/bin/bash
#SBATCH --job-name=yi-1.5-9b
#SBATCH --partition=a3
#SBATCH --exclusive
#SBATCH --nodes 2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/yi-1.5-9b/%x-%j.out
#SBATCH --error=outputs/yi-1.5-9b/%x-%j.out
#
# Slurm batch job that fine-tunes Yi-1.5-9B by launching
# examples/finetuning.py through mpirun, one rank per GPU.
#
# The script uses relative paths (.env/, examples/, outputs/), so it is
# expected to be submitted from the directory that contains them.

set -e

# module load
module load cuda/12.1
module load cudnn/8.9.7
module load hpcx/2.17.1

# open file limit
# NOTE(review): ulimit takes a single limit value; the second number looks like
# an intended hard limit and is presumably ignored by bash - confirm.
ulimit -n 65536 1048576

# python virtualenv
source .env/bin/activate

# Important TCPX environment variables
UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"

# Only use TCPX for multi-node jobs.
if [[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]]; then
  export USE_TCPX=yes

  # Set up NCCL Environment variables
  export NCCL_NET=GPUDirectTCPX_v7
  # These network interfaces use Ubuntu's consistent naming scheme. See
  # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
  export NCCL_SOCKET_IFNAME=enp0s12
  export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
  export NCCL_CROSS_NIC=0
  export NCCL_ALGO=Ring
  export NCCL_PROTO=Simple
  export NCCL_NSOCKS_PERTHREAD=4
  export NCCL_SOCKET_NTHREADS=1
  export NCCL_DYNAMIC_CHUNK_SIZE=524288
  export NCCL_P2P_NET_CHUNKSIZE=524288
  export NCCL_P2P_PCI_CHUNKSIZE=524288
  export NCCL_P2P_NVL_CHUNKSIZE=1048576
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  export NCCL_NET_GDR_LEVEL=PIX
  export NCCL_P2P_PXN_LEVEL=0
  export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="${UDS_PATH}"
  export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
  export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
  export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"

  export LD_LIBRARY_PATH="/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}"
else
  export USE_TCPX=no
  unset NCCL_NET
fi

# distributed settings
# Assign first, export second, so a failing command substitution is not
# hidden behind the exit status of `export`.
MASTER_ADDR=$(scontrol show hostname "${SLURM_JOB_NODELIST}" | head -n1)
export MASTER_ADDR
# Derive the port from the job id so it always falls in 10000-59999.
MASTER_PORT=$((10000 + SLURM_JOB_ID % 50000))
export MASTER_PORT

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile
export NUM_GPU_PER_NODE=8
NODE_TYPE="H100"

NUM_NODES="${SLURM_JOB_NUM_NODES}"
NUM_GPUS=$((NUM_NODES * NUM_GPU_PER_NODE))

# training config
SEQ_LENGTH=4096

MICRO_BATCH_SIZE=8
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000

# optimizer config
LR=2.5E-5
MIN_LR=2.5E-6
LR_WARMUP_STEPS=1000
LR_DECAY_STEPS=25000
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# model config
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model
CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B
CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Yi-1.5-9B

mkdir -p "${CHECKPOINT_SAVE_DIR}"

# data config
DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/yi-1.5

# --data-path receives a flat list of "<number> <dataset prefix>" pairs; an
# array keeps each element a separate argument without relying on word
# splitting of an unquoted string.
DATA_PATH=()

# ja wiki
DATA_PATH+=(2990167836 "${DATASET_DIR}/ja_wiki_text_document")

# job name
JOB_NAME="Yi-1.5-9B-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np "${NUM_GPUS}" \
  --npernode "${NUM_GPU_PER_NODE}" \
  -x MASTER_ADDR="${MASTER_ADDR}" \
  -x MASTER_PORT="${MASTER_PORT}" \
  -bind-to none \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length "${SEQ_LENGTH}" \
  --sliding-window-size "${SEQ_LENGTH}" \
  --micro-batch-size "${MICRO_BATCH_SIZE}" \
  --global-batch-size "${GLOBAL_BATCH_SIZE}" \
  --train-iters "${TRAIN_STEPS}" \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model "${TOKENIZER_MODEL}" \
  --data-path "${DATA_PATH[@]}" \
  --split 949,50,1 \
  --lr "${LR}" \
  --min-lr "${MIN_LR}" \
  --lr-decay-style cosine \
  --lr-warmup-iters "${LR_WARMUP_STEPS}" \
  --lr-decay-iters "${LR_DECAY_STEPS}" \
  --weight-decay "${WEIGHT_DECAY}" \
  --grad-clip-norm "${GRAD_CLIP}" \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-5 \
  --save-interval 500 \
  --eval-interval 100 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model "${CHECKPOINT_DIR}" \
  --save "${CHECKPOINT_SAVE_DIR}" \
  --load "${CHECKPOINT_SAVE_DIR}" \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --use-mpi \
  --wandb-entity "okoge" \
  --wandb-project "llm-recipes" \
  --wandb-name "${JOB_NAME}"
Oops, something went wrong.