diff --git a/.vscode/settings.json b/.vscode/settings.json index 79d6fa1..bbadf97 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,6 +47,7 @@ "pbar", "peft", "plamo", + "probs", "psutil", "pubmed", "samsum", diff --git a/megatron_lm/megatron/core/datasets/Makefile b/megatron_lm/megatron/core/datasets/Makefile index a409f51..7bd3930 100644 --- a/megatron_lm/megatron/core/datasets/Makefile +++ b/megatron_lm/megatron/core/datasets/Makefile @@ -1,7 +1,7 @@ CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color CPPFLAGS += $(shell python3 -m pybind11 --includes) LIBNAME = helpers -LIBEXT = $(shell ${PYENV_ROOT}/versions/3.10.12/bin/python3-config --extension-suffix) +LIBEXT = $(shell ${PYENV_ROOT}/versions/3.11.9/bin/python3-config --extension-suffix) default: $(LIBNAME)$(LIBEXT) diff --git a/requirements.txt b/requirements.txt index 773ef20..d67edc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ --find-links https://download.pytorch.org/whl/torch_stable.html -torch==2.2.2+cu121 +torch==2.3.1+cu121 # huggingface transformers>=4.41.1 diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline.sh b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh similarity index 65% rename from scripts/abci/instruction/swallow-7b/swallow-7b-baseline.sh rename to scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 5c74799..4ebf185 100644 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline.sh +++ b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,16 +1,19 @@ #!/bin/bash #$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 +#$ -l h_rt=0:01:00:00 #$ -j y -#$ -o outputs/instruction/swallow-7b/ +#$ -o outputs/instruction/Llama-3-8B/ #$ -cwd # module load source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 module load hpcx/2.12 +module load gcc/11.4.0 # swich virtual env source .env/bin/activate @@ -44,33 +47,33 @@ while read -r line; do done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" # training config -SEQ_LENGTH=4096 +SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=64 +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=128 # optimizer config -LR=2e-5 -MIN_LR=2e-6 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/baseline-lr_${LR}-minlr_${MIN_LR}" +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3-8B-Instruct-v0.2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline +DATASET_DIR=/groups/gag51395/datasets/instruction/2023-swallow/training/baseline TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl # job name -JOB_NAME="Swallow-7b-VE-baseline-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" 
+JOB_NAME="Llama-3-8B-instruct-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" # run mpirun -np $NUM_GPUS \ @@ -78,20 +81,18 @@ mpirun -np $NUM_GPUS \ -hostfile $HOSTFILE_NAME \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ + -bind-to none \ + -x PATH \ + -x LD_LIBRARY_PATH \ -x PATH \ python examples/finetuning.py \ --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ --micro-batch-size ${MICRO_BATCH_SIZE} \ --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ --instruction-train-data-path ${TRAIN_DATA_PATH} \ --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ + --epoch 1 \ --lr ${LR} \ --min-lr ${MIN_LR} \ --lr-decay-style cosine \ @@ -100,10 +101,10 @@ mpirun -np $NUM_GPUS \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --adam-eps 1e-6 \ + --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ + --eval-interval 500 \ + --eval-iters 10 \ --bf16 \ --mixed-precision \ --base-model ${CHECKPOINT_DIR} \ @@ -116,6 +117,6 @@ mpirun -np $NUM_GPUS \ --instruction-tuning \ --save-sampler-state \ --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ + --wandb-entity "okoge" \ + --wandb-project "llm-recipes" \ --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-13b/swallow-13b-dolly-oasst2-top1-imitation-2-3.sh b/scripts/abci/instruction/swallow-13b/swallow-13b-dolly-oasst2-top1-imitation-2-3.sh deleted file mode 100644 index 10b834e..0000000 --- a/scripts/abci/instruction/swallow-13b/swallow-13b-dolly-oasst2-top1-imitation-2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-13b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=2 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-13b-VE-chat/dolly-oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset 
-DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/dolly-oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-13b-VE-dolly-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-13b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-13b/swallow-13b-oasst2-top1-imitation2-3.sh b/scripts/abci/instruction/swallow-13b/swallow-13b-oasst2-top1-imitation2-3.sh deleted file mode 100644 index b360148..0000000 --- a/scripts/abci/instruction/swallow-13b/swallow-13b-oasst2-top1-imitation2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-13b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=2 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf 
-CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-13b-VE-chat/oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-13b-VE-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-13b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-70b/swallow-70b-baseline_imitatation_2.sh b/scripts/abci/instruction/swallow-70b/swallow-70b-baseline_imitatation_2.sh deleted file mode 100644 index 6eb65ef..0000000 --- a/scripts/abci/instruction/swallow-70b/swallow-70b-baseline_imitatation_2.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=16 -#$ -l h_rt=0:10:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-70b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=1e-5 -MIN_LR=1e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer 
-TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/baseline-imitation-2-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline-imitation_2 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-70b-VE-baseline-imitation-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-70b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-70b/swallow-70b-dolly-oasst2-top1-imitation-2-3.sh b/scripts/abci/instruction/swallow-70b/swallow-70b-dolly-oasst2-top1-imitation-2-3.sh deleted file mode 100644 index 34f0aaf..0000000 --- a/scripts/abci/instruction/swallow-70b/swallow-70b-dolly-oasst2-top1-imitation-2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=16 -#$ -l h_rt=1:05:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-70b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 
-DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=1e-5 -MIN_LR=1e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7-b-VE-chat/dolly-oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/dolly-oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-70b-VE-dolly-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-70b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-70b/swallow-70b-oasst2-top1-imitation2-3.sh b/scripts/abci/instruction/swallow-70b/swallow-70b-oasst2-top1-imitation2-3.sh deleted file mode 100644 index 8cdef14..0000000 --- a/scripts/abci/instruction/swallow-70b/swallow-70b-oasst2-top1-imitation2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=16 -#$ -l h_rt=0:14:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-70b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read 
-r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=1e-5 -MIN_LR=1e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-70b-VE-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-70b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_GB_256.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_GB_256.sh deleted file mode 100644 index c9ae201..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_GB_256.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:08:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * 
${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/baseline-lr_${LR}-minlr_${MIN_LR}_GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-baseline-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_imitatation_2.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_imitatation_2.sh deleted file mode 100644 index 1f12d22..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-baseline_imitatation_2.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:10:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" 
-fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/baseline-imitation-2-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline-imitation_2 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-baseline-imitation-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-dolly-oasst2-top1-imitation-2-3.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-dolly-oasst2-top1-imitation-2-3.sh deleted file mode 100644 index 11bf134..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-dolly-oasst2-top1-imitation-2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=1:00:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then 
- export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/dolly-oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/dolly-oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-dolly-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation-2-oass2-top1.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-imitation-2-oass2-top1.sh deleted file mode 100644 index e61df85..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation-2-oass2-top1.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:08:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ 
"$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=64 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/imitation-2-oasst2-top1-lr_${LR}-minlr_${MIN_LR}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_2_oasst2_top1 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-imitation-2-oasst2-top1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation_1_and_2.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-imitation_1_and_2.sh deleted file mode 100644 index 9c15834..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-imitation_1_and_2.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:08:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - 
-echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=64 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/imitation-1-and-2-lr_${LR}-minlr_${MIN_LR}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_1_and_2 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-imitation-1-and-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/swallow-7b/swallow-7b-oasst2-top1-imitation2-3.sh b/scripts/abci/instruction/swallow-7b/swallow-7b-oasst2-top1-imitation2-3.sh deleted file mode 100644 index cad631c..0000000 --- a/scripts/abci/instruction/swallow-7b/swallow-7b-oasst2-top1-imitation2-3.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:10:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-7b/ -#$ -cwd - -# module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 - -# swich virtual env -source .env/bin/activate - -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" 
-f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) - -echo "MASTER_ADDR=${MASTER_ADDR}" - -# hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi - -NUM_NODES=$NHOSTS -NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) - -mkdir -p ./hostfile - -HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" - -# training config -SEQ_LENGTH=4096 -DATA_PARALLEL_SIZE=$NUM_GPUS - -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=256 - -# optimizer config -LR=2e-5 -MIN_LR=2e-6 -WEIGHT_DECAY=0.1 -GRAD_CLIP=1 - -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-7b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-7b-VE-chat/oasst2-top1-imitation-2-3-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" - -mkdir -p ${CHECKPOINT_SAVE_DIR} - -# dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/oasst2-top1-imitation-2-3 - -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl - -# job name -JOB_NAME="Swallow-7b-VE-oasst2-top1-imitation-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" - -# run -mpirun -np $NUM_GPUS \ - --npernode $NUM_GPU_PER_NODE \ - -hostfile $HOSTFILE_NAME \ - -x MASTER_ADDR=$MASTER_ADDR \ - -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ - -x PATH \ - python examples/finetuning.py \ - --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ - --micro-batch-size ${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --instruction-train-data-path ${TRAIN_DATA_PATH} \ - --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ - --lr ${LR} \ - --min-lr ${MIN_LR} \ - --lr-decay-style cosine \ - --weight-decay ${WEIGHT_DECAY} \ - --grad-clip-norm ${GRAD_CLIP} \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-6 \ - --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ - --bf16 \ - --mixed-precision \ - --base-model ${CHECKPOINT_DIR} \ - --save ${CHECKPOINT_SAVE_DIR} \ - --load ${CHECKPOINT_SAVE_DIR} \ - --low-cpu-fsdp \ - --sharding-strategy FULL_SHARD \ - --checkpoint-type LOCAL_STATE_DICT \ - --fsdp-activation-checkpointing \ - --instruction-tuning \ - --save-sampler-state \ - --use-mpi \ - --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-7b-instruct" \ - --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/llama3/Llama-3-8b.sh b/scripts/abci/llama3/Llama-3-8b.sh index 2208409..e4b9cb6 100644 --- a/scripts/abci/llama3/Llama-3-8b.sh +++ b/scripts/abci/llama3/Llama-3-8b.sh @@ -1,17 +1,17 @@ #!/bin/bash -#$ -l rt_AF=4 -#$ -l h_rt=5:0:00:00 +#$ -l rt_AF=2 +#$ -l h_rt=0:1:00:00 #$ -j y #$ -o outputs/Llama-3-8b/ #$ -cwd # module load source /etc/profile.d/modules.sh -module use /groups/gag51395/modules/modulefiles +module use /bb/llm/gaf51275/modules/modulefiles module load cuda/12.1/12.1.1 module load cudnn/cuda-12.1/9.0.0 -module load nccl/2.17/2.17.1-1 +module load nccl/2.20.5 module load hpcx/2.12 module load gcc/11.4.0 @@ -108,7 +108,7 
@@ mpirun -np $NUM_GPUS \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --adam-eps 1e-5 \ + --adam-eps 1e-8 \ --save-interval 500 \ --eval-interval 100 \ --eval-iters 10 \ diff --git a/scripts/gcp/gemma-swallow/gemma-swallow-27b.sh b/scripts/gcp/gemma-swallow/gemma-swallow-27b.sh new file mode 100644 index 0000000..29da6ba --- /dev/null +++ b/scripts/gcp/gemma-swallow/gemma-swallow-27b.sh @@ -0,0 +1,182 @@ +#!/bin/bash +#SBATCH --job-name=gemma +#SBATCH --partition=a3 +#SBATCH --exclusive +#SBATCH --nodes 8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=3-12:00:00 +#SBATCH --output=outputs/gemma/%x-%j.out +#SBATCH --error=outputs/gemma/%x-%j.out + +set -e + +# module load +module load turing/cuda/12.1 +module load turing/cudnn/8.9.7 +module load turing/nccl/2.20.5 +module load turing/hpcx/2.17.1 + +# open file limit +ulimit -n 65536 1048576 + +# python virtualenv +source .env/bin/activate + +# Important TCPX environment variables +UDS_PATH="/run/tcpx-${SLURM_JOB_ID}" + +# Only use TCPX for multi-node jobs. +[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no + +# Only use TCPX for multi-node jobs. +if [[ ${USE_TCPX} = "yes" ]]; then + # Set up NCCL Environment variables + export NCCL_NET=GPUDirectTCPX_v7 + # These network interfaces use Ubuntu's consistent naming scheme. See + # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12 + export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0 + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_NSOCKS_PERTHREAD=4 + export NCCL_SOCKET_NTHREADS=1 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export NCCL_NET_GDR_LEVEL=PIX + export NCCL_P2P_PXN_LEVEL=0 + export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" + + export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} +else + unset NCCL_NET +fi + +# distributed settings +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile +export NUM_GPU_PER_NODE=8 +NODE_TYPE="H100" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 +TRAIN_STEPS=25000 + +# optimizer config +LR=1.5E-5 +MIN_LR=1.5E-6 +LR_WARMUP_STEPS=1000 +LR_DECAY_STEPS=25000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 +# model config +TOKENIZER_MODEL=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-27b/tokenizer.model +CHECKPOINT_DIR=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-27b +CHECKPOINT_SAVE_DIR="/data/checkpoints/gemma-2-27b/LR${LR}-MINLR${MIN_LR}-WARMUP${LR_WARMUP_STEPS}-WD${WEIGHT_DECAY}-GC${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/data/gemma_datasets/gemma-2_original_transformers-4.42.4 + 
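+# The block below assembles TRAIN_DATA_PATH as a blended-dataset spec:
+# a space-separated list of "<weight> <indexed-dataset prefix>" pairs that is
+# handed to finetuning.py via --data-path and split into train/valid/test
+# by the --split argument further down.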
+TRAIN_DATA_PATH="" + +# ja swallow corpus +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9651717149 ${DATASET_DIR}/split_0_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9509783737 ${DATASET_DIR}/split_1_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 11318518471 ${DATASET_DIR}/split_2_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14904913186 ${DATASET_DIR}/split_3_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34418569125 ${DATASET_DIR}/split_4_text_document" + +# ja wikipedia +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1478281282 ${DATASET_DIR}/ja_wiki_merged_text_document" + +# ja-en laboro +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 769992751 ${DATASET_DIR}/default_plain_text_format_text_document" + +# en wikipedia +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/en_wiki_merged_train_text_document" + +# en refinedweb +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/lumi_en_falcon_merge_text_document" + +# en cosmopedia +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1276041352 ${DATASET_DIR}/cosmopedia_automathtext_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20344318 ${DATASET_DIR}/cosmopedia_khanacademy_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 94906162 ${DATASET_DIR}/cosmopedia_openstax_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 914799590 ${DATASET_DIR}/cosmopedia_stanford_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2621500949 ${DATASET_DIR}/cosmopedia_stories_train_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 158819730 ${DATASET_DIR}/cosmopedia_wikihow_train_text_document" + +# code +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/algebraic-stack_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/proof-pile-2-train_merged_open-web-math_text_document" + + +# job name +JOB_NAME="gemma-2-turing-swallow-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --sliding-window-size ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --data-path ${TRAIN_DATA_PATH} \ + --split 989,10,1 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --lr-decay-iters ${LR_DECAY_STEPS} \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 250 \ + --eval-interval 100 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --use-mpi \ + --wandb-entity "turing-geniac" \ + --wandb-project "gemma-2-turing-swallow" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/gcp/gemma-swallow/gemma-swallow-9b.sh b/scripts/gcp/gemma-swallow/gemma-swallow-9b.sh new file mode 100644 index 0000000..b6e7e16 --- /dev/null +++ b/scripts/gcp/gemma-swallow/gemma-swallow-9b.sh @@ -0,0 +1,182 @@ 
+#!/bin/bash +#SBATCH --job-name=gemma +#SBATCH --partition=a3 +#SBATCH --exclusive +#SBATCH --nodes 8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=3-12:00:00 +#SBATCH --output=outputs/gemma/%x-%j.out +#SBATCH --error=outputs/gemma/%x-%j.out + +set -e + +# module load +module load turing/cuda/12.1 +module load turing/cudnn/8.9.7 +module load turing/nccl/2.20.5 +module load turing/hpcx/2.17.1 + +# open file limit +ulimit -n 65536 1048576 + +# python virtualenv +source .env/bin/activate + +# Important TCPX environment variables +UDS_PATH="/run/tcpx-${SLURM_JOB_ID}" + +# Only use TCPX for multi-node jobs. +[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no + +# Only use TCPX for multi-node jobs. +if [[ ${USE_TCPX} = "yes" ]]; then + # Set up NCCL Environment variables + export NCCL_NET=GPUDirectTCPX_v7 + # These network interfaces use Ubuntu's consistent naming scheme. See + # https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12 + export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0 + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_NSOCKS_PERTHREAD=4 + export NCCL_SOCKET_NTHREADS=1 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export NCCL_NET_GDR_LEVEL=PIX + export NCCL_P2P_PXN_LEVEL=0 + export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" + + export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} +else + unset NCCL_NET +fi + +# distributed settings +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile +export NUM_GPU_PER_NODE=8 +NODE_TYPE="H100" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 +TRAIN_STEPS=25000 + +# optimizer config +LR=2.5E-5 +MIN_LR=2.5E-6 +LR_WARMUP_STEPS=1000 +LR_DECAY_STEPS=25000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 +# model config +TOKENIZER_MODEL=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-9b/tokenizer.model +CHECKPOINT_DIR=/home/ext_kazuki_fujii_turing_motors_c/hf-checkpoints/gemma-2-9b +CHECKPOINT_SAVE_DIR="/data/checkpoints/gemma-2-9b/LR${LR}-MINLR${MIN_LR}-WARMUP${LR_WARMUP_STEPS}-WD${WEIGHT_DECAY}-GC${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/data/gemma_datasets/gemma-2_original_transformers-4.42.4 + +TRAIN_DATA_PATH="" + +# ja swallow corpus +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9651717149 ${DATASET_DIR}/split_0_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9509783737 ${DATASET_DIR}/split_1_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 11318518471 ${DATASET_DIR}/split_2_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14904913186 ${DATASET_DIR}/split_3_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 
34418569125 ${DATASET_DIR}/split_4_text_document"
+
+# ja wikipedia
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1478281282 ${DATASET_DIR}/ja_wiki_merged_text_document"
+
+# ja-en laboro
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 769992751 ${DATASET_DIR}/default_plain_text_format_text_document"
+
+# en wikipedia
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/en_wiki_merged_train_text_document"
+
+# en refinedweb
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1890321820 ${DATASET_DIR}/lumi_en_falcon_merge_text_document"
+
+# en cosmopedia
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1276041352 ${DATASET_DIR}/cosmopedia_automathtext_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20344318 ${DATASET_DIR}/cosmopedia_khanacademy_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 94906162 ${DATASET_DIR}/cosmopedia_openstax_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 914799590 ${DATASET_DIR}/cosmopedia_stanford_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2621500949 ${DATASET_DIR}/cosmopedia_stories_train_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 158819730 ${DATASET_DIR}/cosmopedia_wikihow_train_text_document"
+
+# code
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/algebraic-stack_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4540584279 ${DATASET_DIR}/proof-pile-2-train_merged_open-web-math_text_document"
+
+
+# job name
+JOB_NAME="gemma-2-turing-swallow-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
+
+# run
+mpirun -np $NUM_GPUS \
+  --npernode $NUM_GPU_PER_NODE \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  -bind-to none \
+  -x LD_LIBRARY_PATH \
+  -x PATH \
+  python examples/finetuning.py \
+  --seq-length ${SEQ_LENGTH} \
+  --sliding-window-size ${SEQ_LENGTH} \
+  --micro-batch-size ${MICRO_BATCH_SIZE} \
+  --global-batch-size ${GLOBAL_BATCH_SIZE} \
+  --train-iters ${TRAIN_STEPS} \
+  --tokenizer-type Llama2Tokenizer \
+  --tokenizer-model ${TOKENIZER_MODEL} \
+  --data-path ${TRAIN_DATA_PATH} \
+  --split 989,10,1 \
+  --lr ${LR} \
+  --min-lr ${MIN_LR} \
+  --lr-decay-style cosine \
+  --lr-warmup-iters ${LR_WARMUP_STEPS} \
+  --lr-decay-iters ${LR_DECAY_STEPS} \
+  --weight-decay ${WEIGHT_DECAY} \
+  --grad-clip-norm ${GRAD_CLIP} \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.95 \
+  --adam-eps 1e-8 \
+  --save-interval 500 \
+  --eval-interval 100 \
+  --eval-iters 10 \
+  --bf16 \
+  --mixed-precision \
+  --base-model ${CHECKPOINT_DIR} \
+  --save ${CHECKPOINT_SAVE_DIR} \
+  --load ${CHECKPOINT_SAVE_DIR} \
+  --low-cpu-fsdp \
+  --sharding-strategy FULL_SHARD \
+  --checkpoint-type LOCAL_STATE_DICT \
+  --fsdp-activation-checkpointing \
+  --use-mpi \
+  --wandb-entity "turing-geniac" \
+  --wandb-project "gemma-2-turing-swallow" \
+  --wandb-name "${JOB_NAME}"
diff --git a/scripts/gcp/install.sh b/scripts/gcp/install.sh
new file mode 100644
index 0000000..ca35f01
--- /dev/null
+++ b/scripts/gcp/install.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#SBATCH --job-name=install
+#SBATCH --partition=a3
+#SBATCH --nodes 1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/install/%x-%j.out
+#SBATCH --error=outputs/install/%x-%j.out
+
+set -e
+
+# module load
+module load turing/cuda/12.1
+module load turing/cudnn/8.9.7
+module load turing/nccl/2.20.5
+module load turing/hpcx/2.17.1
+
+# switch virtual env
+source .env/bin/activate
+
+# pip version up
+pip install --upgrade pip
+
+# pip install requirements
+pip install -r requirements.txt
+pip install ninja packaging wheel
+
+# distributed training requirements
+pip install mpi4py
+
+# huggingface requirements
+pip install huggingface_hub
+
+# install transformer engine
+pip install git+https://github.com/NVIDIA/TransformerEngine.git@v1.6
+pip uninstall flash-attn
+
+# install flash-attn
+git clone git@github.com:Dao-AILab/flash-attention.git
+cd flash-attention
+git checkout v2.4.2
+pip install -e .
diff --git a/scripts/tsubame/dpo/Llama-3-8B/Llama-3-8B-chat-v0.2.sh b/scripts/tsubame/dpo/Llama-3-8B/Llama-3-8B-chat-v0.2.sh
new file mode 100644
index 0000000..8e134ab
--- /dev/null
+++ b/scripts/tsubame/dpo/Llama-3-8B/Llama-3-8B-chat-v0.2.sh
@@ -0,0 +1,116 @@
+#!/bin/sh
+#$ -cwd
+#$ -l node_f=4
+#$ -l h_rt=1:00:00:00
+#$ -o outputs/Llama-3-8b-dpo/$JOB_ID.log
+#$ -e outputs/Llama-3-8b-dpo/$JOB_ID.log
+#$ -p -5
+
+# module load
+module use /gs/fs/tga-NII-LLM/modules/modulefiles
+
+module load ylab/cuda/12.1
+module load ylab/cudnn/8.9.7
+module load ylab/nccl/cuda-12.2/2.20.5
+module load ylab/hpcx/2.17.1
+module load ninja/1.11.1
+
+# switch virtual env
+source .env/bin/activate
+
+# distributed settings
+export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
+export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+# hostfile
+export NUM_GPU_PER_NODE=4
+NODE_TYPE="h100"
+
+NUM_NODES=$NHOSTS
+NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))
+
+mkdir -p ./hostfile
+
+HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
+while read -r hostname _ rest; do
+  echo "${hostname} slots=${NUM_GPU_PER_NODE}"
+done <"$PE_HOSTFILE" >"$HOSTFILE_NAME"
+
+# training config
+SEQ_LENGTH=8192
+DATA_PARALLEL_SIZE=$NUM_GPUS
+
+MICRO_BATCH_SIZE=2
+GLOBAL_BATCH_SIZE=128
+
+# optimizer config
+LR=1e-5
+MIN_LR=1e-6
+WEIGHT_DECAY=0.1
+GRAD_CLIP=1
+
+# checkpoint
+TOKENIZER_DIR=/gs/bs/tga-NII-LLM/hf-checkpoints/Meta-Llama-3-8B-Instruct
+CHECKPOINT_DIR=/gs/bs/tga-NII-LLM/swallow-hf/Llama-3-Swallow-8B-v0.1
+CHECKPOINT_SAVE_DIR="/gs/bs/tga-NII-LLM/checkpoints/Llama-3-8B-chat-v0.2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"
+
+mkdir -p ${CHECKPOINT_SAVE_DIR}
+
+# dataset
+DATASET_DIR=/gs/bs/tga-NII-LLM/datasets/raw/dpo/hh-rlhf-12k-ja
+
+TRAIN_DATA_PATH=${DATASET_DIR}/converted.jsonl
+VALID_DATA_PATH=${DATASET_DIR}/converted.jsonl
+
+# job name
+JOB_NAME="Llama-3-8B-dpo-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
+
+# run
+mpirun -np $NUM_GPUS \
+  --npernode $NUM_GPU_PER_NODE \
+  -hostfile $HOSTFILE_NAME \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  -x CUDA_DEVICE_MAX_CONNECTIONS=1 \
+  -x LD_LIBRARY_PATH \
+  -x PATH \
+  -bind-to none \
+  python examples/finetuning.py \
+  --seq-length ${SEQ_LENGTH} \
+  --micro-batch-size ${MICRO_BATCH_SIZE} \
+  --global-batch-size ${GLOBAL_BATCH_SIZE} \
+  --hf-transformer-model-dir ${TOKENIZER_DIR} \
+  --dpo-train-data-path ${TRAIN_DATA_PATH} \
+  --dpo-valid-data-path ${VALID_DATA_PATH} \
+  --epoch 1 \
+  --lr ${LR} \
+  --min-lr ${MIN_LR} \
+  --lr-decay-style cosine \
+  --weight-decay ${WEIGHT_DECAY} \
+  --grad-clip-norm ${GRAD_CLIP} \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.95 \
+  --adam-eps 1e-8 \
+  --save-interval 500 \
+  --eval-interval 500 \
+  --eval-iters 10 \
+  --bf16 \
+  --mixed-precision \
+  --base-model ${CHECKPOINT_DIR} \
+  --save ${CHECKPOINT_SAVE_DIR} \
+  --load ${CHECKPOINT_SAVE_DIR} \
+  --low-cpu-fsdp \
+  --sharding-strategy FULL_SHARD \
+
--checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --direct-preference-optimization \ + --dpo-beta 0.1 \ + --dpo-label-smoothing 0.0 \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3-8B-chat-v0.2" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/tsubame/install.sh b/scripts/tsubame/install.sh index 0535157..900f578 100644 --- a/scripts/tsubame/install.sh +++ b/scripts/tsubame/install.sh @@ -4,23 +4,30 @@ #$ -l h_rt=1:00:00 #$ -p -5 -# priotiry: -5: normal, -4: high, -3: highest - # Load modules -module load cuda/12.1.0 -module load nccl/2.20.5 -module load openmpi/5.0.2-gcc +module use /gs/fs/tga-NII-LLM/modules/modulefiles + +module load ylab/cuda/12.1 +module load ylab/cudnn/8.9.7 +module load ylab/nccl/cuda-12.2/2.20.5 +module load ylab/hpcx/2.17.1 module load ninja/1.11.1 -module load ~/modulefiles/cudnn/9.0.0 # Set environment variables source .env/bin/activate +# pip version up pip install --upgrade pip -# Install packages +# pip install requirements pip install -r requirements.txt -# flash attn +# distirbuted training requirements +pip install mpi4py + +# huggingface requirements +pip install huggingface_hub + +# install flash-atten pip install ninja packaging wheel pip install flash-attn --no-build-isolation diff --git a/scripts/abci/instruction/swallow-13b/swallow-13b-baseline_imitatation_2.sh b/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh similarity index 51% rename from scripts/abci/instruction/swallow-13b/swallow-13b-baseline_imitatation_2.sh rename to scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 582e974..fa9e3f5 100644 --- a/scripts/abci/instruction/swallow-13b/swallow-13b-baseline_imitatation_2.sh +++ b/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,16 +1,19 @@ -#!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:20:00:00 -#$ -j y -#$ -o outputs/instruction/swallow-13b/ +#!/bin/sh #$ -cwd +#$ -l node_f=4 +#$ -l h_rt=0:20:00:00 +#$ -o outputs/Llama-3-8b-instruct/$JOB_ID.log +#$ -e outputs/Llama-3-8b-instruct/$JOB_ID.log +#$ -p -5 # module load -source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load nccl/2.16/2.16.2-1 -module load hpcx/2.12 +module use /gs/fs/tga-NII-LLM/modules/modulefiles + +module load ylab/cuda/12.1 +module load ylab/cudnn/8.9.7 +module load ylab/nccl/cuda-12.2/2.20.5 +module load ylab/hpcx/2.17.1 +module load ninja/1.11.1 # swich virtual env source .env/bin/activate @@ -22,16 +25,8 @@ export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) echo "MASTER_ADDR=${MASTER_ADDR}" # hostfile - -if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then - export NUM_GPU_PER_NODE=4 - NODE_TYPE="v100" -elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then - export NUM_GPU_PER_NODE=8 - NODE_TYPE="a100" -else - echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" -fi +export NUM_GPU_PER_NODE=4 +NODE_TYPE="h100" NUM_NODES=$NHOSTS NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) @@ -39,38 +34,38 @@ NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) mkdir -p ./hostfile HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} -while read -r line; do - echo "${line} slots=${NUM_GPU_PER_NODE}" -done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" +while read -r hostname _ rest; do + echo "${hostname} slots=${NUM_GPU_PER_NODE}" +done <"$PE_HOSTFILE" >"$HOSTFILE_NAME" # training config -SEQ_LENGTH=4096 +SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS MICRO_BATCH_SIZE=2 -GLOBAL_BATCH_SIZE=256 +GLOBAL_BATCH_SIZE=128 # optimizer config -LR=2e-5 
-MIN_LR=2e-6 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 -# checkpoint & tokenizer -TOKENIZER_MODEL=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf/tokenizer.model -CHECKPOINT_DIR=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-13b-hf -CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/llama/checkpoints/Swallow-13b-VE-chat/baseline-imitation-2-lr_${LR}-minlr_${MIN_LR}-GB_${GLOBAL_BATCH_SIZE}" +# checkpoint +TOKENIZER_DIR=/gs/bs/tga-NII-LLM/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/gs/bs/tga-NII-LLM/swallow-hf/Llama-3-Swallow-8B-v0.1 +CHECKPOINT_SAVE_DIR="/gs/bs/tga-NII-LLM/checkpoints/Llama-3-8B-Instruct-v0.2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/baseline-imitation_2 +DATASET_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k -TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl -VALID_DATA_PATH=${DATASET_DIR}/val.jsonl +TRAIN_DATA_PATH=${DATASET_DIR}/converted.jsonl +VALID_DATA_PATH=${DATASET_DIR}/converted.jsonl # job name -JOB_NAME="Swallow-13b-VE-baseline-imitation-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}" +JOB_NAME="Llama-3-8B-instruct-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" # run mpirun -np $NUM_GPUS \ @@ -78,20 +73,18 @@ mpirun -np $NUM_GPUS \ -hostfile $HOSTFILE_NAME \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ - -bind-to none -map-by slot \ + -x CUDA_DEVICE_MAX_CONNECTIONS=1 \ + -x LD_LIBRARY_PATH \ -x PATH \ + -bind-to none \ python examples/finetuning.py \ --seq-length ${SEQ_LENGTH} \ - --sliding-window-size ${SEQ_LENGTH} \ --micro-batch-size ${MICRO_BATCH_SIZE} \ --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --hf-transformer-model-dir ${CHECKPOINT_DIR} \ - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ --instruction-train-data-path ${TRAIN_DATA_PATH} \ --instruction-valid-data-path ${VALID_DATA_PATH} \ - --epoch 2 \ - --train-iters 500000 \ + --epoch 1 \ --lr ${LR} \ --min-lr ${MIN_LR} \ --lr-decay-style cosine \ @@ -100,10 +93,10 @@ mpirun -np $NUM_GPUS \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ - --adam-eps 1e-6 \ + --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 100 \ - --eval-iters 20 \ + --eval-interval 500 \ + --eval-iters 10 \ --bf16 \ --mixed-precision \ --base-model ${CHECKPOINT_DIR} \ @@ -117,5 +110,5 @@ mpirun -np $NUM_GPUS \ --save-sampler-state \ --use-mpi \ --wandb-entity "prj-jalm" \ - --wandb-project "Llama-2-13b-instruct" \ + --wandb-project "Llama-3-8B-Instruct-v0.2" \ --wandb-name "${JOB_NAME}" diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 62da802..c69f447 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -20,6 +20,8 @@ def parse_args() -> argparse.Namespace: # validate if args.use_freeze_layers: assert args.no_save_optimizer_state is True + # adam epsilon is very sensitive value so don't change + assert args.adam_eps == 1e-8 return args @@ -147,6 +149,13 @@ def _add_data_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: '--vocab-extra-ids', type=int, default=0, help='Number of additional vocabulary tokens. 
They are used for span masking in the T5 model' ) + # instruction tuning + group.add_argument( + '--system-prompt-role', type=str, default="system" + ) + group.add_argument( + '--system-prompt-content', type=str, default='あなたは誠実で優秀な日本人のアシスタントです。' + ) return parser @@ -261,6 +270,8 @@ def _add_training_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPars group.add_argument("--instruction-tuning", action="store_true") # DPO group.add_argument("--direct-preference-optimization", action="store_true") + group.add_argument('--dpo-beta', type=float, default=0.1) + group.add_argument('--dpo-label-smoothing', type=float, default=0.0) return parser @@ -308,6 +319,12 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar group.add_argument( "--instruction-valid-data-path", type=str, default=None, ) + group.add_argument( + "--dpo-train-data-path", type=str, default=None, + ) + group.add_argument( + "--dpo-valid-data-path", type=str, default=None, + ) group.add_argument( "--epoch", type=int, default=2, ) diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 3fb45d0..8669558 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -1,3 +1,4 @@ +import copy import os import sys @@ -49,7 +50,8 @@ def main() -> None: # initialize args = parse_args() - set_global_variables(args=args) + is_pretraining = not (args.instruction_tuning or args.direct_preference_optimization) + set_global_variables(args=args, build_tokenizer=is_pretraining) # Set the seeds for reproducibility set_seed(seed=args.seed) @@ -79,6 +81,7 @@ def main() -> None: "name": args.wandb_name, "config": vars(args), } + wandb.require("core") wandb.init(**wandb_setting) if torch_distributed.is_initialized(): @@ -99,6 +102,10 @@ def main() -> None: model = get_model( model_name=args.base_model, use_cache=use_cache ) + if args.direct_preference_optimization: + reference_model = copy.deepcopy(model) + for param in reference_model.parameters(): + param.requires_grad = False if args.load: load_model_state_dict(model, args.load) # type: ignore @@ -113,6 +120,13 @@ def main() -> None: elif args.fp16: model.to(torch.float16) # type: ignore + if args.direct_preference_optimization: + with preserve_fp32_buffers(reference_model): + if args.bf16: + reference_model.to(torch.bfloat16) # type: ignore + elif args.fp16: + reference_model.to(torch.float16) # type: ignore + if args.use_freeze_layers: print_rank_0("NOTE: freeze transformer layers") freeze_transformer_layers(model=model, layer_ranges=args.freeze_layers) @@ -140,9 +154,27 @@ def main() -> None: if args.fsdp_activation_checkpointing: apply_fsdp_checkpointing(model=model, model_name=args.base_model) + if args.direct_preference_optimization: + reference_model = FSDP( + reference_model, # type: ignore + auto_wrap_policy=wrapping_policy, + cpu_offload=CPUOffload(offload_params=True) if args.fsdp_cpu_offload else None, + mixed_precision=mixed_precision_policy, + sharding_strategy=get_sharding_strategy(), + device_id=torch.cuda.current_device(), + limit_all_gathers=True, + sync_module_states=args.low_cpu_fsdp, + param_init_fn=lambda module: module.to_empty( # type: ignore + device=torch.cuda.current_device(), recurse=False, # type: ignore + ) + if args.low_cpu_fsdp and rank != 0 + else None, + ) + if not args.instruction_tuning and not args.direct_preference_optimization: args.continual_pretraining = True + dpo_loss_fn = None if args.continual_pretraining: from llama_recipes.datasets.pretrain_dataset import 
build_train_valid_test_datasets from megatron_lm.megatron.data.data_samplers import build_pretraining_data_loader @@ -165,6 +197,7 @@ def main() -> None: else: from transformers import AutoTokenizer from llama_recipes.utils.instruction_tuning import get_instruction_tuning_dataloader + from llama_recipes.utils.dpo_dataset import get_dpo_dataloader hf_tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path=args.hf_transformer_model_dir @@ -190,7 +223,30 @@ def main() -> None: update_iter_info() elif args.direct_preference_optimization: - pass + from llama_recipes.utils.dpo_loss import DPOLoss + + dpo_loss_fn = DPOLoss( + beta=args.dpo_beta, + label_smoothing=args.dpo_label_smoothing, + ) + + train_dataloader = get_dpo_dataloader( + tokenizer=hf_tokenizer, # type: ignore + data_path=args.dpo_train_data_path, + train=True + ) + validation_dataloader = get_dpo_dataloader( + tokenizer=hf_tokenizer, # type: ignore + data_path=args.dpo_valid_data_path + ) + + args.train_iters = args.dpo_dataset_size // args.global_batch_size * args.epoch + args.lr_decay_iters = args.train_iters + args.lr_warmup_iters = args.lr_decay_iters // 10 + args.save_sampler_state = True + if rank == 0: + from llama_recipes.utils.wandb_utils import update_iter_info + update_iter_info() else: raise ValueError("unknown training mode") @@ -241,6 +297,8 @@ def main() -> None: gradient_accumulation_steps=args.gradient_accumulation_steps, local_rank=get_local_rank(), rank=get_rank(), + dpo_loss_fn=dpo_loss_fn, + reference_model=reference_model if args.direct_preference_optimization else None, ) diff --git a/src/llama_recipes/get_model_decoder_layer.py b/src/llama_recipes/get_model_decoder_layer.py index 6082fdc..b989d28 100644 --- a/src/llama_recipes/get_model_decoder_layer.py +++ b/src/llama_recipes/get_model_decoder_layer.py @@ -1,16 +1,19 @@ from transformers.models.llama.modeling_llama import LlamaDecoderLayer from transformers.models.mistral.modeling_mistral import MistralDecoderLayer from transformers.models.phi3.modeling_phi3 import Phi3DecoderLayer +from transformers.models.gemma2.modeling_gemma2 import Gemma2DecoderLayer def get_model_decoder_layer( model_name: str, -) -> type[LlamaDecoderLayer] | type[MistralDecoderLayer] | type[Phi3DecoderLayer]: +) -> type[LlamaDecoderLayer] | type[MistralDecoderLayer] | type[Phi3DecoderLayer] | type[Gemma2DecoderLayer]: if "Llama" in model_name or "Swallow" in model_name or "Yi" in model_name: return LlamaDecoderLayer elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: return MistralDecoderLayer elif "Phi-3" in model_name: return Phi3DecoderLayer + elif "gemma-2" in model_name: + return Gemma2DecoderLayer else: raise NotImplementedError(f"{model_name}: this model decoder layer is not implemented.") diff --git a/src/llama_recipes/get_models.py b/src/llama_recipes/get_models.py index 487ccfc..05a9851 100644 --- a/src/llama_recipes/get_models.py +++ b/src/llama_recipes/get_models.py @@ -1,8 +1,11 @@ +import time + from transformers import ( LlamaConfig, LlamaForCausalLM, MistralForCausalLM, Phi3ForCausalLM, + Gemma2ForCausalLM, AutoModelForCausalLM, ) from llama_recipes.utils.distributed import is_rank_0 @@ -12,7 +15,7 @@ def get_model( model_name: str, use_cache: bool = False -) -> LlamaForCausalLM | MistralForCausalLM | AutoModelForCausalLM: +) -> LlamaForCausalLM | MistralForCausalLM | Phi3ForCausalLM | Gemma2ForCausalLM: """return CausalLM model Args: @@ -26,6 +29,9 @@ def get_model( LlamaForCausalLM | MistralForCausalLM: PyTorch 
model """ args = get_args() + if is_rank_0(): + print("Instantiating Model ...", flush=True) + init_time = time.perf_counter() if "Llama" in model_name or "Swallow" in model_name: if args.low_cpu_fsdp: @@ -56,8 +62,6 @@ def get_model( use_cache=use_cache, ) - return model # type: ignore - elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: # If using torch.device("meta"), FSDP training hang # FYI: https://github.com/iwiwi/epochraft-hf-fsdp/pull/10#issuecomment-1803360147 @@ -77,8 +81,6 @@ def get_model( torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, ) - return model # type: ignore - elif "Phi-3" in model_name: model = Phi3ForCausalLM.from_pretrained( @@ -91,8 +93,6 @@ def get_model( torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, ) - return model # type: ignore - elif "Yi-1.5" in model_name: # https://huggingface.co/01-ai/Yi-1.5-9B/blob/main/config.json @@ -106,7 +106,21 @@ def get_model( torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, ) - return model # type: ignore + elif "gemma-2" in model_name: + model = Gemma2ForCausalLM.from_pretrained( + model_name, + load_in_8bit=True if args.quantization else None, + device_map="auto" if args.quantization else None, + use_cache=use_cache, + max_position_embeddings=args.seq_length, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, + ) else: raise NotImplementedError("model not implemented") + + if is_rank_0(): + print(f"Model instantiation took {time.perf_counter() - init_time:.2f} secs") + + return model # type: ignore diff --git a/src/llama_recipes/policies/mixed_precision.py b/src/llama_recipes/policies/mixed_precision.py index 4eb5d18..f7157c1 100644 --- a/src/llama_recipes/policies/mixed_precision.py +++ b/src/llama_recipes/policies/mixed_precision.py @@ -1,6 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. - import torch from torch.distributed.fsdp.api import ( @@ -11,7 +8,7 @@ fpSixteen = MixedPrecision( param_dtype=torch.float16, # Gradient communication precision. - reduce_dtype=torch.float16, + reduce_dtype=torch.float32, # Use float32 for gradient communication. (Llama-3, Megatron_LM like) # Buffer precision. buffer_dtype=torch.float16, ) @@ -19,7 +16,7 @@ bfSixteen = MixedPrecision( param_dtype=torch.bfloat16, # Gradient communication precision. - reduce_dtype=torch.bfloat16, + reduce_dtype=torch.float32, # Use float32 for gradient communication. (Llama-3, Megatron_LM like) # Buffer precision. buffer_dtype=torch.bfloat16, cast_forward_inputs=True, @@ -27,7 +24,7 @@ bfSixteen_mixed = MixedPrecision( param_dtype=torch.float32, - reduce_dtype=torch.bfloat16, + reduce_dtype=torch.float32, # Use float32 for gradient communication. 
(Llama-3, Megatron_LM like) buffer_dtype=torch.bfloat16, ) diff --git a/src/llama_recipes/utils/checkpoint.py b/src/llama_recipes/utils/checkpoint.py index c68ac70..4826138 100644 --- a/src/llama_recipes/utils/checkpoint.py +++ b/src/llama_recipes/utils/checkpoint.py @@ -1,6 +1,7 @@ import time import torch import torch.distributed as torch_distributed +from torch.utils.data.distributed import DistributedSampler from torch.distributed.fsdp import ( # noqa: F401 FullyShardedDataParallel as FSDP, # type: ignore StateDictType, # type: ignore @@ -9,6 +10,7 @@ from torch.distributed.fsdp.api import FullOptimStateDictConfig from pathlib import Path import os +import gc from megatron_lm.megatron.global_vars import get_args, get_sampler @@ -42,6 +44,8 @@ def save_model_state_dict(model: FSDP, path: str) -> None: print(f"Saving model state dict to {path}") torch.save(state_dict, path) print(f"Saved model state dict to {path}") + del state_dict + gc.collect() def save_optimizer_state_dict(model: FSDP, optimizer: torch.optim.Optimizer, path: str) -> None: @@ -50,6 +54,8 @@ def save_optimizer_state_dict(model: FSDP, optimizer: torch.optim.Optimizer, pat print(f"Saving optimizer state dict to {path}") torch.save(state_dict, path) print(f"Saved optimizer state dict to {path}") + del state_dict + gc.collect() def save_scheduler_state_dict(scheduler: torch.optim.lr_scheduler.LRScheduler, path: str) -> None: @@ -59,10 +65,10 @@ def save_scheduler_state_dict(scheduler: torch.optim.lr_scheduler.LRScheduler, p print(f"Saved scheduler state dict to {path}") -def save_sampler_state_dict(sampler: torch.utils.data.distributed.DistributedSampler, path: str) -> None: +def save_sampler_state_dict(sampler: DistributedSampler, path: str) -> None: if torch_distributed.get_rank() == 0: print(f"Saving sampler indices to {path}") - torch.save(sampler.state_dict(), path) + torch.save(sampler.state_dict(), path) # type: ignore print(f"Saved sampler indices to {path}") @@ -197,14 +203,14 @@ def load_scheduler_state_dict(scheduler: torch.optim.lr_scheduler.LRScheduler, p del state_dict -def load_sampler_state_dict(sampler: torch.utils.data.distributed.DistributedSampler, path: str) -> None: +def load_sampler_state_dict(sampler: DistributedSampler, path: str) -> None: latest_iteration: int = get_latest_iteration(path) if latest_iteration == 0: return latest_checkpoint_path: str = get_checkpoint_name(path, latest_iteration) state_dict = torch.load(f"{latest_checkpoint_path}/sampler.pt", map_location="cpu") - sampler.load_state_dict(state_dict) + sampler.load_state_dict(state_dict) # type: ignore del state_dict diff --git a/src/llama_recipes/utils/dpo.py b/src/llama_recipes/utils/dpo.py new file mode 100644 index 0000000..ef9d325 --- /dev/null +++ b/src/llama_recipes/utils/dpo.py @@ -0,0 +1,85 @@ +from typing import Tuple +import torch +from torch import nn + + +CROSS_ENTROPY_IGNORE_IDX = -100 + + +def get_batch_log_probs( + logits: torch.FloatTensor, + labels: torch.LongTensor, + label_pad_token_id: int = CROSS_ENTROPY_IGNORE_IDX, +) -> torch.FloatTensor: + """ + Calculate log probabilities based on provided logits and labels. + + Args: + logits (torch.FloatTensor): direct logits output of the model of shape (b, s, v) + labels (torch.LongTensor): ground-truth labels to compute log probs with, shape (b, s). + Label tokens with a value of label_pad_token_id are ignored. + label_pad_token_id (int): token id to ignore in labels. 
+ + Returns: + Calculated log probs of shape (b, ) + + Raises: + ValueError: If logits and labels have different shapes. + """ + + if logits.shape[:-1] != labels.shape: + raise ValueError( + "Logits (batch and sequence length dim) and labels must have the same shape." + ) + + labels = labels[:, 1:].clone() # type: ignore + logits = logits[:, :-1, :] # type: ignore + loss_mask = labels != label_pad_token_id + + labels[labels == label_pad_token_id] = 0 + # take log-likelihood of the labels given our model + per_token_log_probs = torch.gather( + logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2) + ).squeeze(2) + + return (per_token_log_probs * loss_mask).sum(-1) # type: ignore + + +def concatenated_forward( + model: nn.Module, batch: dict[str, torch.Tensor], local_rank: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Run forward pass of the model with chosen and rejected samples concatenated. + + Args: + model (nn.Module): The model to be used for the forward pass. + batch (Tuple[torch.Tensor, torch.Tensor]): Tuple of input_ids and labels. + + Returns: + Tuple of chosen log probs, rejected log probs, chosen logits, rejected logits. + """ + concatenated_input_ids = torch.cat( + [batch['chosen_input_ids'], batch['rejected_input_ids']], dim=0 + ) + concatenated_labels = torch.cat( + [batch['chosen_labels'], batch['rejected_labels']], dim=0 + ) + concatenated_input_ids = concatenated_input_ids.to(local_rank) + concatenated_labels = concatenated_labels.to(local_rank) + + # formed by concatenating an equal number of "chosen" and "rejected". + len_chosen = concatenated_input_ids.shape[0] // 2 + + all_logits = model(concatenated_input_ids).logits + + all_log_probs = get_batch_log_probs( + all_logits, concatenated_labels # type: ignore + ) + + chosen_log_probs = all_log_probs[:len_chosen] + rejected_log_probs = all_log_probs[len_chosen:] + + chosen_logits = all_logits[:len_chosen] + rejected_logits = all_logits[len_chosen:] + + return (chosen_log_probs, rejected_log_probs, chosen_logits, rejected_logits) diff --git a/src/llama_recipes/utils/dpo_dataset.py b/src/llama_recipes/utils/dpo_dataset.py new file mode 100644 index 0000000..2c23c14 --- /dev/null +++ b/src/llama_recipes/utils/dpo_dataset.py @@ -0,0 +1,193 @@ +import copy +import json +import os + +import numpy as np +import torch +from torch.utils.data import Dataset, DataLoader +import torch.distributed as torch_distributed +from transformers.tokenization_utils import PreTrainedTokenizer +from pathlib import Path +from llama_recipes.utils.distributed import print_rank_0 + +from megatron_lm.megatron.global_vars import get_args, set_sampler + + +class DPODataset(Dataset): + def __init__( + self, + tokenizer: PreTrainedTokenizer, + data_path: str, + ) -> None: + args = get_args() + + self.data_path: str = data_path + self.max_tokens: int = args.seq_length + self.tokenizer = tokenizer + + # system prompt + self.system_prompt_role = args.system_prompt_role + self.system_prompt_content = args.system_prompt_content + + # index file + dataset_dir = Path(self.data_path).parent + index_cache_dir = dataset_dir / ".index_cache" + os.makedirs(index_cache_dir, exist_ok=True) + index_file_path = index_cache_dir / str(os.path.basename(self.data_path)).replace(".jsonl", ".idx") + self.index_file_path: str = str(index_file_path) + + try: + with open(self.index_file_path, "r", encoding="utf-8") as f: + self.indexes: list[int] = [int(line.strip()) for line in f] + except Exception as e: + print(f"index file error: {e}") + 
exit(1)
+
+    def __len__(self) -> int:
+        return len(self.indexes)
+
+    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
+        IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
+
+        with open(self.data_path, "r", encoding="utf-8") as file:
+            offset: int = self.indexes[index]
+            file.seek(offset)
+            try:
+                line = file.readline()
+            except Exception as e:
+                print(f"index={index}, offset={offset}, error={e}")
+                exit(1)
+
+        try:
+            conversations: dict[str, list[dict[str, str]] | str] = json.loads(line)
+        except Exception as e:
+            print(f"index={index}, offset={offset}, line={line}, error={e}")
+            exit(1)
+
+        SYSTEM_PROMPT: list[dict[str, str]] = [
+            {
+                "role": self.system_prompt_role,
+                "content": self.system_prompt_content,
+            }
+        ]
+        # chat template
+        prompt = self.tokenizer.apply_chat_template(
+            conversation=SYSTEM_PROMPT + conversations["conversations"],  # type: ignore
+            add_generation_prompt=True,
+            tokenize=True,
+        )
+
+        chosen = self.tokenizer.apply_chat_template(
+            conversation=SYSTEM_PROMPT + conversations["conversations"] + [  # type: ignore
+                {"role": "assistant", "content": conversations["chosen"]}
+            ],
+            tokenize=True,
+        )
+        rejected = self.tokenizer.apply_chat_template(
+            conversation=SYSTEM_PROMPT + conversations["conversations"] + [  # type: ignore
+                {"role": "assistant", "content": conversations["rejected"]}
+            ],
+            tokenize=True,
+        )
+        chosen_input_ids: torch.Tensor = torch.tensor(chosen, dtype=torch.int64)
+        rejected_input_ids: torch.Tensor = torch.tensor(rejected, dtype=torch.int64)
+
+        if len(chosen) > self.max_tokens or len(rejected) > self.max_tokens:
+            print(f"\n\nWARNING: chosen={self.tokenizer.decode(chosen)}\n\n")
+            print(f"\n\nWARNING: rejected={self.tokenizer.decode(rejected)}\n\n")
+
+        eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0]
+        pad_token_id = eos_token_id
+
+        def pad_tensor(tensor: torch.Tensor) -> torch.Tensor:
+            padding_length: int = self.max_tokens - len(tensor)
+            if padding_length > 0:
+                pad_tensor = torch.full(
+                    (padding_length,), pad_token_id, dtype=torch.int64
+                )
+                tensor = torch.cat((tensor, pad_tensor))
+            elif padding_length < 0:
+                tensor = tensor[: self.max_tokens]
+
+            return tensor
+
+        chosen_input_ids = pad_tensor(tensor=chosen_input_ids)
+        rejected_input_ids = pad_tensor(tensor=rejected_input_ids)
+
+        chosen_labels = copy.deepcopy(chosen_input_ids)
+        rejected_labels = copy.deepcopy(rejected_input_ids)
+        # fill the prompt portion with -1 -> ignored by the loss function
+        chosen_labels[: len(prompt)] = -1
+        rejected_labels[: len(prompt)] = -1
+        chosen_label_mask = chosen_labels.ge(0)
+        rejected_label_mask = rejected_labels.ge(0)
+
+        if torch.all(chosen_label_mask == 0) or torch.all(rejected_label_mask == 0):
+            random_index: int = np.random.randint(0, len(self.indexes))
+            return self.__getitem__(random_index)
+
+        # ~label_mask -> fill the prompt portion with IGNORE_INDEX
+        chosen_labels[~chosen_label_mask] = IGNORE_INDEX
+        rejected_labels[~rejected_label_mask] = IGNORE_INDEX
+        chosen_labels[chosen_labels == pad_token_id] = IGNORE_INDEX
+        rejected_labels[rejected_labels == pad_token_id] = IGNORE_INDEX
+
+        return {
+            "chosen_input_ids": chosen_input_ids,
+            "rejected_input_ids": rejected_input_ids,
+            "chosen_labels": chosen_labels,
+            "rejected_labels": rejected_labels,
+        }
+
+
+def worker_init_fn(worker_id: int) -> None:
+    import random
+
+    args = get_args()
+
+    worker_seed = args.seed + worker_id
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+
+
+def get_dpo_dataloader(
+    tokenizer: PreTrainedTokenizer,
+    data_path: str,
+    train:
bool = False, +) -> DataLoader: + from llama_recipes.utils.sequence_length_warmup import CustomDistributedSampler + from llama_recipes.utils.checkpoint import load_sampler_state_dict + + args = get_args() + + dpo_dataset = DPODataset( + tokenizer=tokenizer, + data_path=data_path, + ) + + if train: + args.dpo_dataset_size = len(dpo_dataset) + print_rank_0(f"DPO dataset size: {args.dpo_dataset_size}") + + train_sampler = CustomDistributedSampler( + dataset=dpo_dataset, + rank=torch_distributed.get_rank(), + num_replicas=torch_distributed.get_world_size(), + shuffle=True, + seed=args.seed, + ) + + if args.load: + load_sampler_state_dict(sampler=train_sampler, path=args.load) + + set_sampler(sampler=train_sampler) + + return DataLoader( + dpo_dataset, + batch_size=args.micro_batch_size, + sampler=train_sampler, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + worker_init_fn=worker_init_fn, + ) diff --git a/src/llama_recipes/utils/dpo_loss.py b/src/llama_recipes/utils/dpo_loss.py new file mode 100644 index 0000000..7ab7169 --- /dev/null +++ b/src/llama_recipes/utils/dpo_loss.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DPOLoss(nn.Module): + """ + Direct Preference Optimization (DPO) Loss module: https://arxiv.org/abs/2305.18290. + Simply stated from the paper: + + Intuitively, the DPO update increases the relative log probability of preferred to dispreferred responses, + but it incorporates a dynamic, per-example importance weight that prevents + the model degeneration that we find occurs with a naive probability ratio objective. + + Based on the implementation in HF's TRL library: + https://github.com/huggingface/trl/blob/5d1deb1445828cfd0e947cb3a7925b1c03a283fc/trl/trainer/dpo_trainer.py#L844 + + DPO retains similarities to PPO (https://arxiv.org/abs/2009.01325), where it optimizes a policy + (language) model to align with human preferences, and regularizes the loss function using a baseline + reference (the frozen, initial language model) to prevent over-fitting to the preference dataset. + It differs from PPO by optimizing the policy model directly using labelled preference data, rather + than using an additional reward model to provide feedback. + This significantly simplifies training and reduces compute overhead. + + Args: + beta (float): Temperature parameter for the DPO loss, typically in the range of 0.1 to 0.5. Default is 0.1. + label_smoothing (float): Parameter encoding uncertainty about the labels. Default is 0. + """ + + def __init__( + self, + beta: float = 0.1, + label_smoothing: float = 0.0, + ): + super().__init__() + self.beta = beta + self.label_smoothing = label_smoothing + + def forward( + self, + policy_chosen_logps: torch.Tensor, + policy_rejected_logps: torch.Tensor, + reference_chosen_logps: torch.Tensor, + reference_rejected_logps: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps (torch.Tensor): Log probabilities of the policy model + for the chosen responses. Shape: (batch_size) + policy_rejected_logps (torch.Tensor): Log probabilities of the policy model + for the rejected responses. 
Shape: (batch_size) + reference_chosen_logps (torch.Tensor): Log probabilities of the reference model + for the chosen responses. Shape: (batch_size) + reference_rejected_logps (torch.Tensor): Log probabilities of the reference model + for the rejected responses. Shape: (batch_size) + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of three tensors: + - losses: The DPO loss for each example in the batch. + - chosen_rewards: Rewards for the chosen responses. + - rejected_rewards: Rewards for the rejected responses. + + """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + logits = pi_logratios - ref_logratios + + # The beta is a temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and + # calculates a conservative DPO loss. + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + + chosen_rewards = ( + self.beta * (policy_chosen_logps - reference_chosen_logps).detach() + ) + rejected_rewards = ( + self.beta * (policy_rejected_logps - reference_rejected_logps).detach() + ) + + return losses, chosen_rewards, rejected_rewards + + +class RSOLoss(nn.Module): + """ + Statistical Rejection Sampling Optimization (RSO) or "hinge" loss module: https://arxiv.org/abs/2309.06657. + Intuition from the paper: + + DPO is a logistic regression on human preference data, and SLiC (https://arxiv.org/abs/2305.10425) is almost + equivalent to a support vector machine (SVM) with hinge loss. [RSO] improve[s] SLiC as the SVM counter part of DPO. + + Based on the implementation in HF's TRL library: + https://github.com/huggingface/trl/blob/4dce042a3863db1d375358e8c8092b874b02934b/trl/trainer/dpo_trainer.py#L1141 + + Args: + gamma (float): Equivalent temperature parameter (from DPO) for the RSO loss. + """ + + def __init__( + self, + gamma: float = 0.1, + ): + super().__init__() + self.gamma = gamma + + def forward( + self, + policy_chosen_logps: torch.Tensor, + policy_rejected_logps: torch.Tensor, + reference_chosen_logps: torch.Tensor, + reference_rejected_logps: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute the RSO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps (torch.Tensor): Log probabilities of the policy model + for the chosen responses. Shape: (batch_size) + policy_rejected_logps (torch.Tensor): Log probabilities of the policy model + for the rejected responses. Shape: (batch_size) + reference_chosen_logps (torch.Tensor): Log probabilities of the reference model + for the chosen responses. Shape: (batch_size) + reference_rejected_logps (torch.Tensor): Log probabilities of the reference model + for the rejected responses. Shape: (batch_size) + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of three tensors: + - losses: The RSO loss for each example in the batch. + - chosen_rewards: Rewards for the chosen responses. + - rejected_rewards: Rewards for the rejected responses. 
+ + """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + logits = pi_logratios - ref_logratios + + losses = torch.relu(1 - self.gamma * logits) + + chosen_rewards = ( + self.gamma * (policy_chosen_logps - reference_chosen_logps).detach() + ) + rejected_rewards = ( + self.gamma * (policy_rejected_logps - reference_rejected_logps).detach() + ) + + return losses, chosen_rewards, rejected_rewards + + +class IPOLoss(nn.Module): + """ + Identity Preference Optimization (IPO) Loss module: https://arxiv.org/abs/2310.12036. + Intuition from the paper: + + (Given a policy pi and reference policy, pi_ref) + + IPO learns from preferences dataset simply by regressing the gap between log-likelihood ratios + + log(pi(chosen)/pi(rejected)) and log(pi_ref(chosen)/pi_ref(rejected)) + + to 1/(2*tau), where tau is the temperature parameter. [T]he weaker the regularisation becomes, the + higher would be the log-likelihood ratio of chosen to rejected logprobs. In other words IPO, unlike DPO, + always regularizes its solution towards pi_ref by controlling the gap between the log-likelihood ratios + + log(pi(chosen)/pi(rejected)) and log(pi_ref(chosen)/pi_ref(rejected)) + + thus avoiding the over-fitting to the preference dataset. + + Based on the implementation in HF's TRL library: + https://github.com/huggingface/trl/blob/4dce042a3863db1d375358e8c8092b874b02934b/trl/trainer/dpo_trainer.py#L1143 + + + Args: + tau (float): Equivalent temperature scaling parameter (from DPO) for the IPO loss. From the TRL documentation: + + the [tau] parameter is the reciprocal of the gap between the log-likelihood ratios of the + chosen vs the rejected completion pair and thus the smaller the tau the larger this gap is. + """ + + def __init__( + self, + tau: float = 0.1, + ): + super().__init__() + self.tau = tau + + def forward( + self, + policy_chosen_logps: torch.Tensor, + policy_rejected_logps: torch.Tensor, + reference_chosen_logps: torch.Tensor, + reference_rejected_logps: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps (torch.Tensor): Log probabilities of the policy model + for the chosen responses. Shape: (batch_size) + policy_rejected_logps (torch.Tensor): Log probabilities of the policy model + for the rejected responses. Shape: (batch_size) + reference_chosen_logps (torch.Tensor): Log probabilities of the reference model + for the chosen responses. Shape: (batch_size) + reference_rejected_logps (torch.Tensor): Log probabilities of the reference model + for the rejected responses. Shape: (batch_size) + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of three tensors: + - losses: The DPO loss for each example in the batch. + - chosen_rewards: Rewards for the chosen responses. + - rejected_rewards: Rewards for the rejected responses. 
+ + """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + logits = pi_logratios - ref_logratios + + losses = (logits - 1 / (2 * self.tau)) ** 2 + + chosen_rewards = ( + self.tau * (policy_chosen_logps - reference_chosen_logps).detach() + ) + rejected_rewards = ( + self.tau * (policy_rejected_logps - reference_rejected_logps).detach() + ) + + return losses, chosen_rewards, rejected_rewards diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index a843529..1cb73aa 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -4,7 +4,8 @@ import numpy as np import torch -from torch.utils.data import Dataset +from torch.utils.data import Dataset, DataLoader +import torch.distributed as torch_distributed from transformers.tokenization_utils import PreTrainedTokenizer from pathlib import Path from llama_recipes.utils.distributed import print_rank_0 @@ -24,6 +25,10 @@ def __init__( self.max_words: int = args.seq_length self.tokenizer = tokenizer + # system prompt + self.system_prompt_role = args.system_prompt_role + self.system_prompt_content = args.system_prompt_content + # index file dataset_dir = Path(self.data_path).parent index_cache_dir = dataset_dir / ".index_cache" @@ -54,60 +59,65 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: exit(1) try: - conversations: dict[str, str | list[dict[str, str]]] = json.loads(line) + conversations: dict[str, list[dict[str, str]] | str] = json.loads(line) except Exception as e: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) - SYSTEM_PROMPT = [ - {"role": "system", "text": "あなたは誠実で優秀な日本人のアシスタントです。"} + SYSTEM_PROMPT: list[dict[str, str]] = [ + { + "role": self.system_prompt_role, + "content": self.system_prompt_content, + } ] # chat template - prompt: str = self.tokenizer.apply_chat_template( + prompt = self.tokenizer.apply_chat_template( conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - tokenize=False + add_generation_prompt=True, + tokenize=True, ) - example: str = prompt + conversations["output"] # type: ignore - encoded_prompt: torch.Tensor = torch.tensor( - self.tokenizer.encode(prompt, add_special_tokens=False), - dtype=torch.int64 - ) - encoded_example: list[int] = self.tokenizer.encode( - example, add_special_tokens=False + example = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore + {"role": "assistant", "content": conversations["output"]} + ], + tokenize=True, ) - encoded_example.append(self.tokenizer.eos_token_id) # type: ignore - encoded_tensor_example: torch.Tensor = torch.tensor(encoded_example, dtype=torch.int64) - - if len(encoded_example) > self.max_words: - print(f"\n\nWARNING: example={example}\n\n") - - padding: int = self.max_words - encoded_tensor_example.shape[0] - if padding > 0: # pad_token_id = 0 (substitute unk_token) - encoded_tensor_example = torch.cat((encoded_tensor_example, torch.zeros(padding, dtype=torch.int64) - 1)) - elif padding < 0: - encoded_tensor_example = encoded_tensor_example[: self.max_words] - - labels = copy.deepcopy(encoded_tensor_example) + tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + + if len(example) > self.max_words: + print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") + + padding_length: int = self.max_words - len(example) + eos_token_id: int = 
self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] + pad_token_id = eos_token_id + if padding_length > 0: + pad_tensor = torch.full( + (padding_length,), pad_token_id, dtype=torch.int64 + ) + tensor_example = torch.cat((tensor_example, pad_tensor)) + elif padding_length < 0: + tensor_example = tensor_example[: self.max_words] + + labels = copy.deepcopy(tensor_example) # promptの長さ分だけ -1 で埋める -> 損失関数で無視するようになる - labels[: len(encoded_prompt)] = -1 - # 0より大きい(ge)かどうかの真偽値でmaskを作成 - example_mask = encoded_tensor_example.ge(0) + labels[: len(prompt)] = -1 label_mask = labels.ge(0) - if torch.all(label_mask == 0): # len(output) == 0 + if torch.all(label_mask == 0): # 予測部分がない random_index: int = np.random.randint(0, len(self.indexes)) self.__getitem__(random_index) - # ~example_mask -> paddingの部分を 0 で埋める - encoded_tensor_example[~example_mask] = 0 # ~label_mask -> prompt の部分を ignore_index で埋める labels[~label_mask] = IGNORE_INDEX + labels[labels == pad_token_id] = IGNORE_INDEX + # mask out pad token + attention_mask = (tensor_example != pad_token_id).float() return { - "input_ids": encoded_tensor_example, + "input_ids": tensor_example, "labels": labels, - "attention_mask": example_mask.float(), + "attention_mask": attention_mask, } @@ -125,7 +135,7 @@ def get_instruction_tuning_dataloader( tokenizer: PreTrainedTokenizer, data_path: str, train: bool = False, -) -> torch.utils.data.DataLoader: +) -> DataLoader: from llama_recipes.utils.sequence_length_warmup import CustomDistributedSampler from llama_recipes.utils.checkpoint import load_sampler_state_dict @@ -142,8 +152,8 @@ def get_instruction_tuning_dataloader( train_sampler = CustomDistributedSampler( dataset=instruction_dataset, - rank=torch.distributed.get_rank(), - num_replicas=torch.distributed.get_world_size(), + rank=torch_distributed.get_rank(), + num_replicas=torch_distributed.get_world_size(), shuffle=True, seed=args.seed, ) @@ -153,7 +163,7 @@ def get_instruction_tuning_dataloader( set_sampler(sampler=train_sampler) - return torch.utils.data.DataLoader( + return DataLoader( instruction_dataset, batch_size=args.micro_batch_size, sampler=train_sampler, diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index cd9c800..12b948e 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -1,6 +1,5 @@ import os import time -from contextlib import nullcontext import torch import torch.cuda.nccl as nccl @@ -11,6 +10,7 @@ from llama_recipes.policies import fpSixteen, bfSixteen, bfSixteen_mixed, get_decoder_layer_wrapper from llama_recipes.utils.wandb_utils import log_model_info, log_wandb from llama_recipes.utils.checkpoint import save_checkpoint, get_latest_iteration +from llama_recipes.utils.dpo_loss import DPOLoss from typing import Optional, Any import wandb @@ -39,6 +39,8 @@ def train( gradient_accumulation_steps: int, local_rank: Optional[int] = None, rank: Optional[int] = None, + dpo_loss_fn: Optional[DPOLoss] = None, + reference_model: Optional[torch.nn.Module] = None, ) -> None: """ Trains the model on the given dataloader @@ -63,7 +65,6 @@ def train( world_size = int(os.environ["WORLD_SIZE"]) local_rank = local_rank if local_rank is not None else 0 - autocast = torch.cuda.amp.autocast if args.fp16 else nullcontext # type: ignore # set model info if rank == 0 and args.wandb_name: @@ -100,11 +101,60 @@ def train( batch = next(train_dataloader) - for key in batch.keys(): - batch[key] = batch[key].to(local_rank) + if 
args.direct_preference_optimization: + # DPO( Direct Preference Optimization) + from llama_recipes.utils.dpo import concatenated_forward + + if dpo_loss_fn is None: + raise ValueError( + "DPO(Direct Preference Optimization) is enabled, but dpo loss function is None" + ) + if reference_model is None: + raise ValueError( + "DPO(Direct Preference Optimization) is enabled, but reference model is None" + ) + + # forward + ( + policy_chosen_log_probs, + policy_rejected_log_probs, + policy_chosen_logits, + policy_rejected_logits, + ) = concatenated_forward(model=model, batch=batch, local_rank=local_rank) + + policy_chosen_logits_mean = policy_chosen_logits.detach().mean() + policy_rejected_logits_mean = policy_rejected_logits.detach().mean() + + # deleting logits here helps reduce (peak) memory usage - we only need them for metric logging + del policy_chosen_logits, policy_rejected_logits + + with torch.no_grad(): + ( + reference_chosen_log_probs, + reference_rejected_log_probs, + _, + _, + ) = concatenated_forward(model=reference_model, batch=batch, local_rank=local_rank) + + loss, chosen_rewards, rejected_rewards = dpo_loss_fn( + policy_chosen_log_probs, + policy_rejected_log_probs, + reference_chosen_log_probs, + reference_rejected_log_probs, + ) + loss = loss.mean() + reward_accuracies = (chosen_rewards > rejected_rewards).float() + else: + # continual-pre-training & Instruction Tuning + for key in batch.keys(): + batch[key] = batch[key].to(local_rank) + + with torch.cuda.amp.autocast( + enabled=args.mixed_precision, + dtype=torch.bfloat16 if args.bf16 else torch.float16 + ): + loss: torch.Tensor = model(**batch).loss - with autocast(): - loss: torch.Tensor = model(**batch).loss loss = loss / gradient_accumulation_steps if args.fp16: @@ -119,8 +169,13 @@ def train( # gradient clipping if args.grad_clip_norm > 0: clip_grad_norm_(model.parameters(), args.grad_clip_norm) - real_batch_size: int = batch["input_ids"].shape[0] - real_seq_len: int = batch["input_ids"].shape[1] + + if args.direct_preference_optimization: + real_batch_size: int = batch["chosen_input_ids"].shape[0] + real_seq_len: int = batch["chosen_input_ids"].shape[1] + else: + real_batch_size: int = batch["input_ids"].shape[0] + real_seq_len: int = batch["input_ids"].shape[1] # gradient accumulation end iteration += 1 @@ -152,6 +207,21 @@ def train( world_size=world_size, iteration_start_time=iteration_start_time, ) + if args.direct_preference_optimization: + wandb.log( + { + "rewards/chosen": chosen_rewards.mean().cpu(), + "rewards/rejected": rejected_rewards.mean().cpu(), + "rewards/accuracies": reward_accuracies.mean().cpu(), + "rewards/margins": (chosen_rewards - rejected_rewards).mean().cpu(), # type: ignore + "log_probs/rejected": policy_rejected_log_probs.detach().mean().cpu(), + "log_probs/chosen": policy_chosen_log_probs.detach().mean().cpu(), + "logits/rejected": policy_rejected_logits_mean.cpu(), + "logits/chosen": policy_chosen_logits_mean.cpu(), + }, + step=iteration, + ) + total_loss = 0.0 iteration_start_time = time.perf_counter() diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index 4efc617..e54860b 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -4,12 +4,16 @@ #$ -j y #$ -o outputs/convert/ckpt/ #$ -cwd + # module load source /etc/profile.d/modules.sh -module load cuda/11.8/11.8.0 -module load cudnn/8.9/8.9.2 -module load 
nccl/2.16/2.16.2-1 +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 module load hpcx/2.12 +module load gcc/11.4.0 set -e diff --git a/tools/dataset/convert_dataset_dpo.py b/tools/dataset/convert_dataset_dpo.py new file mode 100644 index 0000000..4b487dc --- /dev/null +++ b/tools/dataset/convert_dataset_dpo.py @@ -0,0 +1,42 @@ +import argparse +import json + + +def convert_jsonl(input_path: str, output_path: str, from_key: str, value_key: str) -> None: + converted_data = [] + + with open(input_path, 'r', encoding='utf-8') as file: + for line in file: + item = json.loads(line) + conversations = item['conversations'] + chosen = item['chosen'] + rejected = item['rejected'] + converted_conversations = [] + for conversation in conversations: + converted_conversation = { + 'role': conversation[from_key], + 'content': conversation[value_key] + } + converted_conversations.append(converted_conversation) + converted_data.append({ + 'conversations': converted_conversations, + 'chosen': chosen, + 'rejected': rejected + }) + with open(output_path, 'w', encoding='utf-8') as outfile: + for item in converted_data: + outfile.write(json.dumps(item, ensure_ascii=False) + '\n') + +def main(): + parser = argparse.ArgumentParser(description="Convert JSONL file keys to specified format.") + parser.add_argument('--input-path', type=str, help='Path to the input JSONL file') + parser.add_argument('--output-path', type=str, help='Path to the output JSONL file') + parser.add_argument('--from-key', type=str, default='from', help='Key name to be converted to role') + parser.add_argument('--value-key', type=str, default='value', help='Key name to be converted to context') + + args = parser.parse_args() + + convert_jsonl(args.input_path, args.output_path, args.from_key, args.value_key) + +if __name__ == "__main__": + main() diff --git a/tools/dataset/convert_dataset_instruct.py b/tools/dataset/convert_dataset_instruct.py new file mode 100644 index 0000000..ed35caf --- /dev/null +++ b/tools/dataset/convert_dataset_instruct.py @@ -0,0 +1,42 @@ +import argparse +import json +import copy + + +def convert_jsonl(input_path: str, output_path: str) -> None: + converted_data = [] + + with open(input_path, 'r', encoding='utf-8') as file: + for line in file: + item = json.loads(line) + messages = item['messages'] + + assert len(messages) % 2 == 0 + conversation_turn: int = len(messages) // 2 + + inputs = [] + for i in range(conversation_turn): + user_message = messages[i * 2] + assistant_message = messages[i * 2 + 1] + inputs.append(user_message) + converted_data.append({ + "input": copy.deepcopy(inputs), + "output": assistant_message + }) + inputs.append(assistant_message) + + with open(output_path, 'w', encoding='utf-8') as outfile: + for item in converted_data: + outfile.write(json.dumps(item, ensure_ascii=False) + '\n') + +def main(): + parser = argparse.ArgumentParser(description="Convert JSONL file keys to specified format.") + parser.add_argument('--input-path', type=str, help='Path to the input JSONL file') + parser.add_argument('--output-path', type=str, help='Path to the output JSONL file') + + args = parser.parse_args() + + convert_jsonl(args.input_path, args.output_path) + +if __name__ == "__main__": + main() diff --git a/tools/dataset/debug_chat_template.py b/tools/dataset/debug_chat_template.py index 03e22b6..025b71e 100644 --- a/tools/dataset/debug_chat_template.py +++ b/tools/dataset/debug_chat_template.py @@ -13,28 +13,54 @@ 
"input": [ { "role": "user", - "text": "こんにちは!" + "content": "こんにちは!" }, { "role": "assistant", - "text": "こんにちは!ご質問やお困りのことがありましたら、何でもご相談ください。何が必要か教えてください。" + "content": "こんにちは!ご質問やお困りのことがありましたら、何でもご相談ください。何が必要か教えてください。" }, { "role": "user", - "text": "世界のすべての国をアルファベット順に、それぞれの国の人口を教えてください。" + "content": "世界のすべての国をアルファベット順に、それぞれの国の人口を教えてください。" } - ], - "output": "世界中の国をアルファベット順に並べたリストと、その国の推定人口です:\n\nアフガニスタン: 38,928,346 アルバニア: 2,877,797 アルジェリア:44,344,744 アンドラ: 77,265 アンゴラ: 32,878,272 アンティグア・バーブーダ: 97,929 アルゼンチン: 45,195,774 アルメニア: 2,977,600 オーストラリア: 25,499,884 オーストリア: 9,006,398 アゼルバイジャン: 10,134,604 バハマ:393,248 バーレーン: 1,714,571 バングラデシュ: 164,689,383164,689,383 バルバドス: 287,375 ベラルーシ: 9,449,323 ベルギー: 11,589,623 ベリーズ: 397,628 ベナン: 12,123,200 ブータン: 754,288 ボリビア: 11,673,095 ボスニア・ヘルツェゴビナ: 3,279,000 ボツワナ:2,335,814 ブラジル: 212,559,417 ブルネイ: 441,484 ブルガリア: 7,034,879 ブルキナファソ: 20,903,273 ブルンジ: 11,890,781 カボベルデ: 555,987 カンボジア: 16,205,218 カメルーン: 26,545,863 カナダ:37,742,154 中央アフリカ共和国4,829,767 チャド: 16,425,864 チリ: 19,116,201 中国: 1,439,323,776 コロンビア: 50,882,891 コモロ: 869,601 コンゴ民主共和国:87,534,403 コンゴ共和国5,457,821 コスタリカ5,094,118 コートジボワール: 26,378,274 クロアチア: 4,105,267 キューバ: 11,239,224 キプロス:1,207,359 チェコ:10,708,919 デンマーク:5,792,2025,792,202" + ] } -chat_template: str = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['text'] %}{% elif false == true and not '<>' in messages[0]['text'] %}{% set loop_messages = messages %}{% set system_message = 'あなたは誠実で優秀な日本人のアシスタントです。' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['text'] %}{% else %}{% set content = message['text'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" +conversations_with_output = [ + { + "role": "user", + "content": "こんにちは!" 
+    },
+    {
+        "role": "assistant",
+        "content": "こんにちは!ご質問やお困りのことがありましたら、何でもご相談ください。何が必要か教えてください。"
+    },
+    {
+        "role": "user",
+        "content": "世界のすべての国をアルファベット順に、それぞれの国の人口を教えてください。"
+    },
+    {
+        "role": "assistant",
+        "content": "output",
+    }
+]
+
+chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
 
 print("before apply chat template")
 
 prompt: str = tokenizer.apply_chat_template(
-    [{"role": "system", "text": "あなたは誠実で優秀な日本人のアシスタントです。"}] + conversations["input"],  # type: ignore
-    # chat_template=chat_template,
-    tokenize=False
+    [{"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。"}] + conversations["input"],  # type: ignore
+    add_generation_prompt=True,
+    # tokenize=False
 )
 print(prompt)
+print(type(prompt))
+
+print("--------------------------------")
+
+print(tokenizer.apply_chat_template(
+    conversation=conversations_with_output,
+    # tokenize=False
+))
diff --git a/tools/inference/inference.py b/tools/inference/inference.py
index 0828a1b..e0a1a58 100644
--- a/tools/inference/inference.py
+++ b/tools/inference/inference.py
@@ -2,7 +2,7 @@
 
 import torch
-from transformers import AutoTokenizer, MistralForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 parser = argparse.ArgumentParser(description="Generation")
@@ -17,7 +17,7 @@
 tokenizer = AutoTokenizer.from_pretrained(
     pretrained_model_name_or_path=args.tokenizer_path,
 )
-model = MistralForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
     args.model_path, device_map="auto", torch_dtype=torch.bfloat16
 )
@@ -29,8 +29,8 @@
 )
 outputs = model.generate(  # type: ignore
     input_ids.to(device=model.device),  # type: ignore
-    max_new_tokens=128,
-    temperature=0.99,
+    max_new_tokens=1024,
+    temperature=0.7,
     top_p=0.95,
     do_sample=True,
 )
diff --git a/tools/inference/inference.sh b/tools/inference/inference.sh
index 33d8c51..4d05001 100644
--- a/tools/inference/inference.sh
+++ b/tools/inference/inference.sh
@@ -4,24 +4,22 @@
 #$ -j y
 #$ -o outputs/inference/
 #$ -cwd
+
 # module load
 source /etc/profile.d/modules.sh
-module load cuda/11.8/11.8.0
-module load cudnn/8.9/8.9.2
-module load nccl/2.16/2.16.2-1
+module use /bb/llm/gaf51275/modules/modulefiles
+
+module load cuda/12.1/12.1.1
+module load cudnn/cuda-12.1/9.0.0
+module load nccl/2.20.5
 module load hpcx/2.12
+module load gcc/11.4.0
 
 set -e
 
 # swich virtual env
 source .env/bin/activate
 
-# distributed settings
-export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
-export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))
-
-echo "MASTER_ADDR=${MASTER_ADDR}"
-
 python tools/inference/inference.py \
     --model-path /bb/llm/gaf51275/llama/converted-hf-checkpoint/mistral-7B-VE/okazaki-cc/iter_0004000 \
     --tokenizer-path /bb/llm/gaf51275/llama/converted-hf-checkpoint/mistral-7B-VE/okazaki-cc/iter_0004000 \
diff --git a/tools/model-upload/upload.py b/tools/model-upload/upload.py
index 76816fd..91828fe 100644
--- a/tools/model-upload/upload.py
+++ b/tools/model-upload/upload.py
@@ -1,25 +1,41 @@
 import os
 import argparse
-
+from tqdm import tqdm
 from huggingface_hub import HfApi, create_repo
 
+def upload_directory(api, local_dir, repo_name, repo_type, branch_name):
+    for root, _, files in os.walk(local_dir):
+        for file in tqdm(files, desc=f"Uploading files in {root}"):
+            local_path = os.path.join(root, file)
+            repo_path = os.path.relpath(local_path, local_dir)
+
+            print(f"Uploading {repo_path} to branch {branch_name}...")
+            api.upload_file(
+                path_or_fileobj=local_path,
+                path_in_repo=repo_path,
+                repo_id=repo_name,
+                repo_type=repo_type,
+                commit_message=f"Upload {repo_path}",
+                revision=branch_name,
+            )
+            print(f"Successfully uploaded {repo_path}!")
+
+
 parser = argparse.ArgumentParser()
-parser.add_argument("--ckpt-path", type=str)
-parser.add_argument("--repo-name", type=str)
-parser.add_argument("--branch-name", type=str, default="main")
+parser.add_argument("--ckpt-path", type=str, help="Path to the checkpoint directory")
+parser.add_argument("--repo-name", type=str, help="Name of the Hugging Face repository")
+parser.add_argument("--branch-name", type=str, default="main", help="Branch name in the repository")
 args = parser.parse_args()
 
 converted_ckpt: str = args.ckpt_path
 repo_name: str = args.repo_name
 branch_name: str = args.branch_name
+
 try:
     create_repo(repo_name, repo_type="model", private=True)
 except Exception as e:
-    print(f"repo {repo_name} already exists! error: {e}")
-    pass
-
-files = os.listdir(converted_ckpt)
+    print(f"Repository {repo_name} already exists! Error: {e}")
 
 api = HfApi()
 if branch_name != "main":
@@ -29,17 +45,9 @@
         repo_type="model",
         branch=branch_name,
     )
-    except Exception:
-        print(f"branch {branch_name} already exists, try again...")
-print(f"to upload: {files}")
-for file in files:
-    print(f"Uploading {file} to branch {branch_name}...")
-    api.upload_file(
-        path_or_fileobj=os.path.join(converted_ckpt, file),
-        path_in_repo=file,
-        repo_id=repo_name,
-        repo_type="model",
-        commit_message=f"Upload {file}",
-        revision=branch_name,
-    )
-    print(f"Successfully uploaded {file} !")
+    except Exception as e:
+        print(f"Branch {branch_name} already exists. Error: {e}")
+
+print(f"Starting upload of directory: {converted_ckpt}")
+upload_directory(api, converted_ckpt, repo_name, "model", branch_name)
+print("Upload completed successfully!")
diff --git a/tools/model-upload/upload.sh b/tools/model-upload/upload.sh
index a5dba10..b60b766 100644
--- a/tools/model-upload/upload.sh
+++ b/tools/model-upload/upload.sh
@@ -2,16 +2,42 @@
 
 set -e
 
-start=2080
-end=2080
-increment=5000
+start=9250
+end=9250
+increment=2500
 
-upload_base_dir=/bb/llm/gaf51275/llama/converted-hf-checkpoint/Swallow-7b-VE-chat/imitation-1-and-2-lr_2e-5-minlr_2e-6-GB_64
+EXPERIMENT_NAME=exp1
+
+upload_base_dir=/bb/llm/gaf51275/2024/checkpoints/Llama-3-8b-instruct-v0.2/${EXPERIMENT_NAME}/LR1.0E-5-MINLR1.0E-6-WD0.1
+
+upload_checkpoint() {
+    local upload_dir=$1
+    local repo_name=$2
+    local max_retries=5
+    local retry_count=0
+
+    while [ $retry_count -lt $max_retries ]; do
+        if python scripts/abci/upload/upload.py \
+            --ckpt-path "$upload_dir" \
+            --repo-name "$repo_name"; then
+            echo "Successfully uploaded $repo_name"
+            return 0
+        else
+            echo "Upload failed for $repo_name. Retrying..."
+            ((retry_count++))
+            sleep 5
+        fi
+    done
+
+    echo "Failed to upload $repo_name after $max_retries attempts"
+    return 1
+}
 
 for ((i = start; i <= end; i += increment)); do
     upload_dir=$upload_base_dir/iter_$(printf "%07d" $i)
+    repo_name="tokyotech-llm/Llama-3-8b-instruct-v0.2-${EXPERIMENT_NAME}-LR1.0e-5-MINLR1.0E-6-iter$(printf "%07d" $i)"
 
-    python tools/model-upload/upload.py \
-        --ckpt-path $upload_dir \
-        --repo-name tokyotech-llm/Swallow-7b-VE-instruct-v1.0-imitation-1-and-2-lr_2e-5-minlr_2e-6-GB_64-iter$(printf "%07d" $i)
+    if ! upload_checkpoint "$upload_dir" "$repo_name"; then
+        echo "Skipping to next checkpoint after repeated failures for $repo_name"
+    fi
 done
diff --git a/tools/pre-process/scripts/index.sh b/tools/pre-process/scripts/index.sh
index e333900..2a46f3f 100644
--- a/tools/pre-process/scripts/index.sh
+++ b/tools/pre-process/scripts/index.sh
@@ -2,39 +2,8 @@
 
 source .env/bin/activate
 
-INPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training
+INPUT_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k
 
 # baseline
 python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline/val.jsonl
-
-# baseline-imitation_2
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline-imitation_2/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/baseline-imitation_2/val.jsonl
-
-# ichikara
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/ichikara/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/ichikara/val.jsonl
-
-# imitation_1_and_2
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_1_and_2/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_1_and_2/val.jsonl
-
-# imitation_2_oasst2_top1
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_2_oasst2_top1/train.jsonl
-
-python tools/pre-process/index_dataset.py \
-    --data-file-path $INPUT_DIR/imitation_2_oasst2_top1/val.jsonl
+    --data-file-path $INPUT_DIR/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl
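Note: as a quick sanity check of the two new converters added above, the following is a minimal usage sketch; the paths are hypothetical placeholders and only illustrate the ShareGPT-style ('from'/'value') and chat-style ('messages') inputs that the code expects.

    # DPO data: rewrite each conversation turn from {'from', 'value'} keys to
    # {'role', 'content'} keys, keeping the 'chosen'/'rejected' fields unchanged.
    python tools/dataset/convert_dataset_dpo.py \
        --input-path /path/to/dpo_raw.jsonl \
        --output-path /path/to/dpo_converted.jsonl \
        --from-key from \
        --value-key value

    # Instruction data: split an even-length 'messages' list into one
    # {"input": [...], "output": ...} record per assistant turn, with the
    # running conversation history copied into "input".
    python tools/dataset/convert_dataset_instruct.py \
        --input-path /path/to/chat_raw.jsonl \
        --output-path /path/to/instruct_converted.jsonl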