Commit 1d741c4

feat: support codestral
1 parent: 87510b3

File tree

6 files changed: 175 additions & 3 deletions

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
   "anyprecision",
   "autocast",
   "bettertransformer",
+  "Codestral",
   "colour",
   "Concatenator",
   "detokenize",

scripts/gcp/codestral-22b.sh

Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
#!/bin/bash
#SBATCH --job-name=codestral
#SBATCH --partition=a3
#SBATCH --exclusive
#SBATCH --nodes 2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/codestral/%x-%j.out
#SBATCH --error=outputs/codestral/%x-%j.out

set -e

# module load
module load cuda/12.1
module load cudnn/8.9.7
module load hpcx/2.17.1

# open file limit
ulimit -n 65536

# python virtualenv
source .env/bin/activate

# Important TCPX environment variables
UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"

# Only use TCPX for multi-node jobs.
[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no

if [[ ${USE_TCPX} = "yes" ]]; then
  # Set up NCCL environment variables.
  export NCCL_NET=GPUDirectTCPX_v7
  # These network interfaces use Ubuntu's consistent naming scheme. See
  # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
  export NCCL_SOCKET_IFNAME=enp0s12
  export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
  export NCCL_CROSS_NIC=0
  export NCCL_ALGO=Ring
  export NCCL_PROTO=Simple
  export NCCL_NSOCKS_PERTHREAD=4
  export NCCL_SOCKET_NTHREADS=1
  export NCCL_DYNAMIC_CHUNK_SIZE=524288
  export NCCL_P2P_NET_CHUNKSIZE=524288
  export NCCL_P2P_PCI_CHUNKSIZE=524288
  export NCCL_P2P_NVL_CHUNKSIZE=1048576
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  export NCCL_NET_GDR_LEVEL=PIX
  export NCCL_P2P_PXN_LEVEL=0
  export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH}
  export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
  export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
  export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"

  export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}
else
  unset NCCL_NET
fi

# distributed settings
export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile
export NUM_GPU_PER_NODE=8
NODE_TYPE="H100"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

# training config
SEQ_LENGTH=4096
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000

# optimizer config
LR=2.5E-5
MIN_LR=2.5E-6
LR_WARMUP_STEPS=1000
LR_DECAY_STEPS=25000
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# model config
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model
CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1
CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Codestral-22B-v0.1

mkdir -p ${CHECKPOINT_SAVE_DIR}

# data config
DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1

TRAIN_DATA_PATH=""

# ja wiki
TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2741303196 ${DATASET_DIR}/ja_wiki_text_document"

# job name
JOB_NAME="Codestral-22B-v0.1-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --sliding-window-size ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --train-iters ${TRAIN_STEPS} \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model ${TOKENIZER_MODEL} \
  --data-path ${TRAIN_DATA_PATH} \
  --split 949,50,1 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --lr-warmup-iters ${LR_WARMUP_STEPS} \
  --lr-decay-iters ${LR_DECAY_STEPS} \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-5 \
  --save-interval 500 \
  --eval-interval 100 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --use-mpi \
  --wandb-entity "okoge" \
  --wandb-project "llm-recipes" \
  --wandb-name "${JOB_NAME}"
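
Two sketches follow; neither is part of the commit. First, if llm-recipes derives gradient-accumulation steps the usual way, global batch / (micro batch * data-parallel size) (an assumption about the trainer's internals), the defaults above imply 64 accumulation steps:

# Hypothetical sanity check mirroring the script's defaults (2 nodes x 8 GPUs).
NUM_GPUS=16
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
echo "accumulation steps: $((GLOBAL_BATCH_SIZE / (MICRO_BATCH_SIZE * NUM_GPUS)))"  # -> 64

Second, a submission sketch; note that Slurm does not create the directory named in --output/--error, so it must exist before the job starts:

# Hypothetical usage, assuming the repository root as working directory.
mkdir -p outputs/codestral
sbatch scripts/gcp/codestral-22b.sh
# Command-line flags override the #SBATCH defaults if a different size is needed:
# sbatch --nodes 4 scripts/gcp/codestral-22b.sh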
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
#!/bin/bash

# switch virtual env
source .env/bin/activate

DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1

mkdir -p ${OUTPUT_DIR}

# tokenize Japanese Wikipedia
python megatron_lm/tools/preprocess_data.py \
    --input ${DATASET_DIR}/ja_wiki.jsonl \
    --output-prefix ${OUTPUT_DIR}/ja_wiki \
    --tokenizer-type Llama2Tokenizer \
    --tokenizer-model /home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model \
    --append-eod \
    --workers 64
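
Assuming preprocess_data.py follows the usual Megatron-LM naming convention ({output-prefix}_{json-field}_document), this step should emit the .bin/.idx pair that the training script's TRAIN_DATA_PATH points at. A quick sanity check with the paths from this script:

# Hypothetical check, not part of the commit.
ls -lh ${OUTPUT_DIR}/ja_wiki_text_document.bin ${OUTPUT_DIR}/ja_wiki_text_document.idx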

src/llama_recipes/get_model_decoder_layer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def get_model_decoder_layer(
88
) -> type[LlamaDecoderLayer] | type[MistralDecoderLayer] | type[Phi3DecoderLayer]:
99
if "Llama" in model_name or "Swallow" in model_name or "Yi" in model_name:
1010
return LlamaDecoderLayer
11-
elif "Mistral" in model_name or "mistral" in model_name:
11+
elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name:
1212
return MistralDecoderLayer
1313
elif "Phi-3" in model_name:
1414
return Phi3DecoderLayer
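
The extra clause is needed because "Codestral-22B-v0.1" contains neither "Mistral" nor "mistral" as a substring, so the old condition would never match. A minimal shell check of the substring logic (illustrative only):

name="Codestral-22B-v0.1"
[[ "$name" == *Mistral* || "$name" == *mistral* ]] && echo match || echo no-match  # prints: no-match
[[ "$name" == *Codestral* ]] && echo match  # prints: match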

src/llama_recipes/get_models.py

Lines changed: 3 additions & 1 deletion
@@ -58,7 +58,7 @@ def get_model(
 
         return model  # type: ignore
 
-    elif "Mistral" in model_name or "mistral" in model_name:
+    elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name:
         # If torch.device("meta") is used, FSDP training hangs.
         # FYI: https://github.com/iwiwi/epochraft-hf-fsdp/pull/10#issuecomment-1803360147
         # and https://github.com/pytorch/pytorch/issues/105840 may be helpful.
@@ -94,6 +94,8 @@ def get_model(
         return model  # type: ignore
 
     elif "Yi-1.5" in model_name:
+        # https://huggingface.co/01-ai/Yi-1.5-9B/blob/main/config.json
+
         model = LlamaForCausalLM.from_pretrained(
             model_name,
             load_in_8bit=True if args.quantization else None,
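
Codestral-22B-v0.1 ships with a Mistral-architecture config on Hugging Face, so reusing the existing Mistral branch (including its meta-device workaround) should be sufficient; no separate loading path is added.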

tools/tokenizer_check.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ def is_sentencepiece_model(file_path):
         return False
 
 
-file_path = '/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model'
+file_path = '/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model'
 if is_sentencepiece_model(file_path):
     print("The file is a SentencePiece model.")
 else:
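
Run from the repository root (assuming the virtualenv from the scripts above is active), the check prints one of the two messages defined in the script:

# Hypothetical usage, not part of the commit.
python tools/tokenizer_check.py
# Expected here, since the training script loads this tokenizer.model as a SentencePiece model:
# The file is a SentencePiece model.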
