#!/bin/bash
#SBATCH --job-name=codestral
#SBATCH --partition=a3
#SBATCH --exclusive
#SBATCH --nodes 2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/codestral/%x-%j.out
#SBATCH --error=outputs/codestral/%x-%j.out

set -e

# module load
module load cuda/12.1
module load cudnn/8.9.7
module load hpcx/2.17.1

# open file limit
ulimit -n 65536

# python virtualenv
source .env/bin/activate

# Important TCPX environment variables
UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"

# Only use TCPX for multi-node jobs.
[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no

# Configure NCCL for GPUDirect-TCPX only when it is enabled.
if [[ ${USE_TCPX} = "yes" ]]; then
  # Set up NCCL environment variables
  export NCCL_NET=GPUDirectTCPX_v7
  # These network interfaces use Ubuntu's consistent naming scheme. See
  # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
  export NCCL_SOCKET_IFNAME=enp0s12
  export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
  export NCCL_CROSS_NIC=0
  export NCCL_ALGO=Ring
  export NCCL_PROTO=Simple
  export NCCL_NSOCKS_PERTHREAD=4
  export NCCL_SOCKET_NTHREADS=1
  export NCCL_DYNAMIC_CHUNK_SIZE=524288
  export NCCL_P2P_NET_CHUNKSIZE=524288
  export NCCL_P2P_PCI_CHUNKSIZE=524288
  export NCCL_P2P_NVL_CHUNKSIZE=1048576
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  export NCCL_NET_GDR_LEVEL=PIX
  export NCCL_P2P_PXN_LEVEL=0
  export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH}
  export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
  export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
  export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"

  export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}
else
  unset NCCL_NET
fi
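
# Optional diagnostic: NCCL_DEBUG and NCCL_DEBUG_SUBSYS are standard NCCL
# variables; enabling them prints which network plugin NCCL selected, which
# makes it easy to confirm GPUDirect-TCPX is actually in use. Left commented
# out here since it is not required for the run.
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=INIT,NET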

# distributed settings
export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000)))
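# The port is derived from the job ID so that concurrent jobs sharing nodes
# pick different rendezvous ports, e.g. SLURM_JOBID=123456 gives
# 10000 + (123456 % 50000) = 10000 + 23456 = 33456, always within 10000-59999.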

echo "MASTER_ADDR=${MASTER_ADDR}"

# node / GPU counts
export NUM_GPU_PER_NODE=8
NODE_TYPE="H100"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))
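# With the allocation requested in the #SBATCH header (2 nodes x 8 GPUs),
# NUM_NODES=2 and NUM_GPUS=16; NUM_GPUS drives both mpirun's process count and
# the data-parallel size below.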

# training config
SEQ_LENGTH=4096
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000
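
# Assuming the trainer derives gradient accumulation the usual way (an
# assumption about examples/finetuning.py, not stated in this script):
#   accumulation steps = GLOBAL_BATCH_SIZE / (MICRO_BATCH_SIZE * DATA_PARALLEL_SIZE)
#                      = 1024 / (1 * 16) = 64
# i.e. each optimizer step sees 1024 sequences * 4096 tokens, roughly 4.2M tokens.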

# optimizer config
LR=2.5E-5
MIN_LR=2.5E-6
LR_WARMUP_STEPS=1000
LR_DECAY_STEPS=25000
WEIGHT_DECAY=0.1
GRAD_CLIP=1
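
# Under the usual cosine convention (assumed, not read from the trainer): the
# rate warms up linearly to LR over LR_WARMUP_STEPS, then decays as
#   lr(t) = MIN_LR + 0.5 * (LR - MIN_LR) * (1 + cos(pi * t_decay / T_decay))
# reaching MIN_LR = 2.5E-6 at step LR_DECAY_STEPS = 25000, the final step.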

# model config
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model
CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1
CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Codestral-22B-v0.1

mkdir -p ${CHECKPOINT_SAVE_DIR}

# data config
DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1

TRAIN_DATA_PATH=""

# ja wiki
TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2741303196 ${DATASET_DIR}/ja_wiki_text_document"
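# Each entry follows the Megatron-style "<weight> <path-prefix>" convention
# (assumed from the blended-dataset format): the leading number is the sampling
# weight for that corpus. Additional corpora can be appended the same way,
# e.g. (illustrative path only):
# TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1000000000 ${DATASET_DIR}/en_wiki_text_document"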

# job name
JOB_NAME="Codestral-22B-v0.1-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --sliding-window-size ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --train-iters ${TRAIN_STEPS} \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model ${TOKENIZER_MODEL} \
  --data-path ${TRAIN_DATA_PATH} \
  --split 949,50,1 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --lr-warmup-iters ${LR_WARMUP_STEPS} \
  --lr-decay-iters ${LR_DECAY_STEPS} \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-5 \
  --save-interval 500 \
  --eval-interval 100 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --use-mpi \
  --wandb-entity "okoge" \
  --wandb-project "llm-recipes" \
  --wandb-name "${JOB_NAME}"
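
# Usage sketch (the script filename is illustrative): create the log directory
# that the #SBATCH --output/--error paths point at, then submit the job.
#   mkdir -p outputs/codestral
#   sbatch codestral-22b.sh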