diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5e8fb88..79d6fa1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -18,6 +18,7 @@
     "anyprecision",
     "autocast",
     "bettertransformer",
+    "Codestral",
     "colour",
     "Concatenator",
     "detokenize",
diff --git a/scripts/abci/phi3/phi3-14b.sh b/scripts/abci/phi3/phi3-14b.sh
index b057715..512d7f9 100644
--- a/scripts/abci/phi3/phi3-14b.sh
+++ b/scripts/abci/phi3/phi3-14b.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #$ -l rt_AF=4
-#$ -l h_rt=5:0:00:00
+#$ -l h_rt=10:00:00:00
 #$ -j y
 #$ -o outputs/phi-3/
 #$ -cwd
@@ -73,8 +73,36 @@
 mkdir -p ${CHECKPOINT_SAVE_DIR}

 DATA_PATH=""

+# Swallow v1
+DATA_PATH="${DATA_PATH} 9108171060 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_0_text_document"
+DATA_PATH="${DATA_PATH} 9017389663 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_1_text_document"
+DATA_PATH="${DATA_PATH} 10781891782 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_2_text_document"
+DATA_PATH="${DATA_PATH} 14229527811 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_3_text_document"
+DATA_PATH="${DATA_PATH} 33251122086 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_4_text_document"
+
 # ja wikipedia
-DATA_PATH="${DATA_PATH} 2657688677 /bb/llm/gaf51275/binarized/phi-3-default/ja_wiki_text_document"
+DATA_PATH="${DATA_PATH} 2659052072 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/ja_wiki_merged_text_document"
+
+# parallel corpus
+DATA_PATH="${DATA_PATH} 1265915426 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/default_plain_text_format_text_document"
+
+# en wikipedia
+DATA_PATH="${DATA_PATH} 1400935123 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/en_wiki_merged_train_text_document"
+
+# en refinedweb
+DATA_PATH="${DATA_PATH} 1400935123 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/lumi_en_falcon_merge_text_document"
+
+# en cosmopedia
+DATA_PATH="${DATA_PATH} 1394911660 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_automathtext_train_text_document"
+DATA_PATH="${DATA_PATH} 22852028 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_khanacademy_train_text_document"
+DATA_PATH="${DATA_PATH} 115215400 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_openstax_train_text_document"
+DATA_PATH="${DATA_PATH} 1120661316 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_stanford_train_text_document"
+DATA_PATH="${DATA_PATH} 3131907229 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_stories_train_text_document"
+DATA_PATH="${DATA_PATH} 195599284 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_wikihow_train_text_document"
+
+# code algebraic stack
+DATA_PATH="${DATA_PATH} 10903912936 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/algebraic-stack_text_document"
+
 # job name
 JOB_NAME="Phi-3-ABCI-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-${SEQ_LENGTH}s-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
diff --git a/scripts/gcp/codestral-22b.sh b/scripts/gcp/codestral-22b.sh
new file mode 100644
index 0000000..caa9ee6
--- /dev/null
+++ b/scripts/gcp/codestral-22b.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+#SBATCH --job-name=codestral
+#SBATCH --partition=a3
+#SBATCH --exclusive
+#SBATCH --nodes 2
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/codestral/%x-%j.out
+#SBATCH --error=outputs/codestral/%x-%j.out
+
+set -e
+
+# module load
+module load cuda/12.1
+module load cudnn/8.9.7
+module load hpcx/2.17.1
+
+# open file limit
+ulimit -n 65536 1048576
+
+# python virtualenv
+source .env/bin/activate
+
+# Important TCPX environment variables
+UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"
+
+# Only use TCPX for multi-node jobs.
+[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no
+
+# Only use TCPX for multi-node jobs.
+if [[ ${USE_TCPX} = "yes" ]]; then
+  # Set up NCCL Environment variables
+  export NCCL_NET=GPUDirectTCPX_v7
+  # These network interfaces use Ubuntu's consistent naming scheme. See
+  # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
+  export NCCL_SOCKET_IFNAME=enp0s12
+  export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
+  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
+  export NCCL_CROSS_NIC=0
+  export NCCL_ALGO=Ring
+  export NCCL_PROTO=Simple
+  export NCCL_NSOCKS_PERTHREAD=4
+  export NCCL_SOCKET_NTHREADS=1
+  export NCCL_DYNAMIC_CHUNK_SIZE=524288
+  export NCCL_P2P_NET_CHUNKSIZE=524288
+  export NCCL_P2P_PCI_CHUNKSIZE=524288
+  export NCCL_P2P_NVL_CHUNKSIZE=1048576
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  export NCCL_NET_GDR_LEVEL=PIX
+  export NCCL_P2P_PXN_LEVEL=0
+  export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH}
+  export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
+  export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
+  export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"
+
+  export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}
+else
+  unset NCCL_NET
+fi
+
+# distributed settings
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+# hostfile
+export NUM_GPU_PER_NODE=8
+NODE_TYPE="H100"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))
+
+# training config
+SEQ_LENGTH=4096
+DATA_PARALLEL_SIZE=$NUM_GPUS
+
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=1024
+TRAIN_STEPS=25000
+
+# optimizer config
+LR=2.5E-5
+MIN_LR=2.5E-6
+LR_WARMUP_STEPS=1000
+LR_DECAY_STEPS=25000
+WEIGHT_DECAY=0.1
+GRAD_CLIP=1
+# model config
+TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model
+CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1
+CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Codestral-22B-v0.1
+
+mkdir -p ${CHECKPOINT_SAVE_DIR}
+
+# data config
+DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1
+
+TRAIN_DATA_PATH=""
+
+# ja wiki
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2741303196 ${DATASET_DIR}/ja_wiki_text_document"
+
+# job name
+JOB_NAME="Codestral-22B-v0.1-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
+
+# run
+mpirun -np $NUM_GPUS \
+  --npernode $NUM_GPU_PER_NODE \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  -bind-to none \
+  -x LD_LIBRARY_PATH \
+  -x PATH \
+  python examples/finetuning.py \
+  --seq-length ${SEQ_LENGTH} \
+  --sliding-window-size ${SEQ_LENGTH} \
+  --micro-batch-size ${MICRO_BATCH_SIZE} \
+  --global-batch-size ${GLOBAL_BATCH_SIZE} \
+  --train-iters ${TRAIN_STEPS} \
+  --tokenizer-type Llama2Tokenizer \
+  --tokenizer-model ${TOKENIZER_MODEL} \
+  --data-path ${TRAIN_DATA_PATH} \
+  --split 949,50,1 \
+  --lr ${LR} \
+  --min-lr ${MIN_LR} \
+  --lr-decay-style cosine \
+  --lr-warmup-iters ${LR_WARMUP_STEPS} \
+  --lr-decay-iters ${LR_DECAY_STEPS} \
+  --weight-decay ${WEIGHT_DECAY} \
+  --grad-clip-norm ${GRAD_CLIP} \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.95 \
+  --adam-eps 1e-5 \
+  --save-interval 500 \
+  --eval-interval 100 \
+  --eval-iters 10 \
+  --bf16 \
+  --mixed-precision \
+  --base-model ${CHECKPOINT_DIR} \
+  --save ${CHECKPOINT_SAVE_DIR} \
+  --load ${CHECKPOINT_SAVE_DIR} \
+  --low-cpu-fsdp \
+  --sharding-strategy FULL_SHARD \
+  --checkpoint-type LOCAL_STATE_DICT \
+  --fsdp-activation-checkpointing \
+  --use-mpi \
+  --wandb-entity "okoge" \
+  --wandb-project "llm-recipes" \
+  --wandb-name "${JOB_NAME}"
diff --git a/scripts/gcp/tokenize/codestral-ja-wiki.sh b/scripts/gcp/tokenize/codestral-ja-wiki.sh
new file mode 100644
index 0000000..7c61e08
--- /dev/null
+++ b/scripts/gcp/tokenize/codestral-ja-wiki.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# switch virtual env
+source .env/bin/activate
+
+DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
+OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1
+
+mkdir -p ${OUTPUT_DIR}
+
+# tokenize japanese wikipedia
+python megatron_lm/tools/preprocess_data.py \
+  --input ${DATASET_DIR}/ja_wiki.jsonl \
+  --output-prefix ${OUTPUT_DIR}/ja_wiki \
+  --tokenizer-type Llama2Tokenizer \
+  --tokenizer-model /home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model \
+  --append-eod \
+  --workers 64
diff --git a/scripts/gcp/tokenize/yi-1.5-ja-wiki.sh b/scripts/gcp/tokenize/yi-1.5-ja-wiki.sh
new file mode 100644
index 0000000..63edeaf
--- /dev/null
+++ b/scripts/gcp/tokenize/yi-1.5-ja-wiki.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# switch virtual env
+source .env/bin/activate
+
+DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
+OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/yi-1.5
+
+mkdir -p ${OUTPUT_DIR}
+
+# tokenize japanese wikipedia
+python megatron_lm/tools/preprocess_data.py \
+  --input ${DATASET_DIR}/ja_wiki.jsonl \
+  --output-prefix ${OUTPUT_DIR}/ja_wiki \
+  --tokenizer-type Llama2Tokenizer \
+  --tokenizer-model /home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model \
+  --append-eod \
+  --workers 64
diff --git a/scripts/gcp/yi-1.5-9b.sh b/scripts/gcp/yi-1.5-9b.sh
new file mode 100644
index 0000000..fcd8ac9
--- /dev/null
+++ b/scripts/gcp/yi-1.5-9b.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+#SBATCH --job-name=yi-1.5-9b
+#SBATCH --partition=a3
+#SBATCH --exclusive
+#SBATCH --nodes 2
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/yi-1.5-9b/%x-%j.out
+#SBATCH --error=outputs/yi-1.5-9b/%x-%j.out
+
+set -e
+
+# module load
+module load cuda/12.1
+module load cudnn/8.9.7
+module load hpcx/2.17.1
+
+# open file limit
+ulimit -n 65536 1048576
+
+# python virtualenv
+source .env/bin/activate
+
+# Important TCPX environment variables
+UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"
+
+# Only use TCPX for multi-node jobs.
+[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no
+
+# Only use TCPX for multi-node jobs.
+if [[ ${USE_TCPX} = "yes" ]]; then
+  # Set up NCCL Environment variables
+  export NCCL_NET=GPUDirectTCPX_v7
+  # These network interfaces use Ubuntu's consistent naming scheme. See
+  # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
+  export NCCL_SOCKET_IFNAME=enp0s12
+  export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
+  export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
+  export NCCL_CROSS_NIC=0
+  export NCCL_ALGO=Ring
+  export NCCL_PROTO=Simple
+  export NCCL_NSOCKS_PERTHREAD=4
+  export NCCL_SOCKET_NTHREADS=1
+  export NCCL_DYNAMIC_CHUNK_SIZE=524288
+  export NCCL_P2P_NET_CHUNKSIZE=524288
+  export NCCL_P2P_PCI_CHUNKSIZE=524288
+  export NCCL_P2P_NVL_CHUNKSIZE=1048576
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  export NCCL_NET_GDR_LEVEL=PIX
+  export NCCL_P2P_PXN_LEVEL=0
+  export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH}
+  export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
+  export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
+  export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"
+
+  export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}
+else
+  unset NCCL_NET
+fi
+
+# distributed settings
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+# hostfile
+export NUM_GPU_PER_NODE=8
+NODE_TYPE="H100"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))
+
+# training config
+SEQ_LENGTH=4096
+DATA_PARALLEL_SIZE=$NUM_GPUS
+
+MICRO_BATCH_SIZE=8
+GLOBAL_BATCH_SIZE=1024
+TRAIN_STEPS=25000
+
+# optimizer config
+LR=2.5E-5
+MIN_LR=2.5E-6
+LR_WARMUP_STEPS=1000
+LR_DECAY_STEPS=25000
+WEIGHT_DECAY=0.1
+GRAD_CLIP=1
+# model config
+TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model
+CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B
+CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Yi-1.5-9B
+
+mkdir -p ${CHECKPOINT_SAVE_DIR}
+
+# data config
+DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/yi-1.5
+
+DATA_PATH=""
+
+# ja wiki
+DATA_PATH="${DATA_PATH} 2990167836 ${DATASET_DIR}/ja_wiki_text_document"
+
+# job name
+JOB_NAME="Yi-1.5-9B-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
+
+# run
+mpirun -np $NUM_GPUS \
+  --npernode $NUM_GPU_PER_NODE \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  -bind-to none \
+  -x LD_LIBRARY_PATH \
+  -x PATH \
+  python examples/finetuning.py \
+  --seq-length ${SEQ_LENGTH} \
+  --sliding-window-size ${SEQ_LENGTH} \
+  --micro-batch-size ${MICRO_BATCH_SIZE} \
+  --global-batch-size ${GLOBAL_BATCH_SIZE} \
+  --train-iters ${TRAIN_STEPS} \
+  --tokenizer-type Llama2Tokenizer \
+  --tokenizer-model ${TOKENIZER_MODEL} \
+  --data-path ${DATA_PATH} \
+  --split 949,50,1 \
+  --lr ${LR} \
+  --min-lr ${MIN_LR} \
+  --lr-decay-style cosine \
+  --lr-warmup-iters ${LR_WARMUP_STEPS} \
+  --lr-decay-iters ${LR_DECAY_STEPS} \
+  --weight-decay ${WEIGHT_DECAY} \
+  --grad-clip-norm ${GRAD_CLIP} \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.95 \
+  --adam-eps 1e-5 \
+  --save-interval 500 \
+  --eval-interval 100 \
+  --eval-iters 10 \
+  --bf16 \
+  --mixed-precision \
+  --base-model ${CHECKPOINT_DIR} \
+  --save ${CHECKPOINT_SAVE_DIR} \
+  --load ${CHECKPOINT_SAVE_DIR} \
+  --low-cpu-fsdp \
+  --sharding-strategy FULL_SHARD \
+  --checkpoint-type LOCAL_STATE_DICT \
+  --fsdp-activation-checkpointing \
+  --use-mpi \
+  --wandb-entity "okoge" \
+  --wandb-project "llm-recipes" \
+  --wandb-name "${JOB_NAME}"
diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py
index f5b05d3..25cb33f 100644
--- a/src/llama_recipes/arguments.py
+++ b/src/llama_recipes/arguments.py
@@ -182,9 +182,6 @@ def _add_training_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPars
     group.add_argument("--save", type=str, default=None)
     group.add_argument("--base-model", type=str, default=None)

-    # use flash attention, better transformer
-    group.add_argument("--use-better-transformer", action="store_true")
-
     group.add_argument("--grad-clip-norm", type=float, default=1.0)

     # interval
diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py
index 5a19c64..2e61de3 100644
--- a/src/llama_recipes/finetuning.py
+++ b/src/llama_recipes/finetuning.py
@@ -102,14 +102,6 @@ def main() -> None:
     if args.load:
         load_model_state_dict(model, args.load)  # type: ignore

-    if args.use_better_transformer:
-        try:
-            from optimum.bettertransformer import BetterTransformer
-
-            model = BetterTransformer.transform(model)  # type: ignore
-        except ImportError:
-            print("Module 'optimum' not found. Please install 'optimum' it before proceeding.")
-
     print_model_size(model, args.base_model, rank)  # type: ignore

     # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled
diff --git a/src/llama_recipes/get_model_decoder_layer.py b/src/llama_recipes/get_model_decoder_layer.py
index e2ce5a0..6082fdc 100644
--- a/src/llama_recipes/get_model_decoder_layer.py
+++ b/src/llama_recipes/get_model_decoder_layer.py
@@ -6,9 +6,9 @@
 def get_model_decoder_layer(
     model_name: str,
 ) -> type[LlamaDecoderLayer] | type[MistralDecoderLayer] | type[Phi3DecoderLayer]:
-    if "Llama" in model_name or "Swallow" in model_name:
+    if "Llama" in model_name or "Swallow" in model_name or "Yi" in model_name:
         return LlamaDecoderLayer
-    elif "Mistral" in model_name or "mistral" in model_name:
+    elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name:
         return MistralDecoderLayer
     elif "Phi-3" in model_name:
         return Phi3DecoderLayer
diff --git a/src/llama_recipes/get_models.py b/src/llama_recipes/get_models.py
index 084c5a2..487ccfc 100644
--- a/src/llama_recipes/get_models.py
+++ b/src/llama_recipes/get_models.py
@@ -58,7 +58,7 @@ def get_model(
         return model  # type: ignore

-    elif "Mistral" in model_name or "mistral" in model_name:
+    elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name:
         # If using torch.device("meta"), FSDP training hang
         # FYI: https://github.com/iwiwi/epochraft-hf-fsdp/pull/10#issuecomment-1803360147
         # https://github.com/pytorch/pytorch/issues/105840 are maybe helpful
@@ -93,5 +93,20 @@ def get_model(
         return model  # type: ignore

+    elif "Yi-1.5" in model_name:
+        # https://huggingface.co/01-ai/Yi-1.5-9B/blob/main/config.json
+
+        model = LlamaForCausalLM.from_pretrained(
+            model_name,
+            load_in_8bit=True if args.quantization else None,
+            device_map="auto" if args.quantization else None,
+            use_cache=use_cache,
+            max_position_embeddings=args.seq_length,
+            attn_implementation="flash_attention_2",
+            torch_dtype=torch.bfloat16 if args.bf16 else torch.float16,
+        )
+
+        return model  # type: ignore
+
     else:
         raise NotImplementedError("model not implemented")
diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py
index b0bb80d..cd9c800 100644
--- a/src/llama_recipes/utils/train_utils.py
+++ b/src/llama_recipes/utils/train_utils.py
@@ -1,6 +1,5 @@
 import os
 import time
-from pkg_resources import packaging  # type: ignore
 from contextlib import nullcontext

 import torch
@@ -328,7 +327,6 @@ def get_policies(rank: int, model_name: str):
     verify_bfloat_support: bool = (
         torch.version.cuda  # type: ignore
         and torch.cuda.is_bf16_supported()
-        and packaging.version.parse(torch.version.cuda).release >= (11, 0)  # type: ignore
         and torch_distributed.is_nccl_available()
         and nccl.version() >= (2, 10)
     )
diff --git a/tools/tokenizer_check.py b/tools/tokenizer_check.py
index 2fadd40..8aaa73d 100644
--- a/tools/tokenizer_check.py
+++ b/tools/tokenizer_check.py
@@ -10,7 +10,7 @@ def is_sentencepiece_model(file_path):
         return False


-file_path = '/bb/llm/gaf51275/hf-checkpoints/Phi-3-medium-4k-instruct/tokenizer.model'
+file_path = '/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model'
 if is_sentencepiece_model(file_path):
     print("The file is a SentencePiece model.")
 else: