From c5b6618d952050d2e5d54753f4e72a8cfdec9011 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Thu, 19 Oct 2023 08:03:09 +0000 Subject: [PATCH 01/50] Add [train_ds]: initial commit for train_ds.py --- scripts/run_ds.sh | 6 ++++++ train_ds.py | 0 2 files changed, 6 insertions(+) create mode 100755 scripts/run_ds.sh create mode 100644 train_ds.py diff --git a/scripts/run_ds.sh b/scripts/run_ds.sh new file mode 100755 index 0000000..63ddbd2 --- /dev/null +++ b/scripts/run_ds.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export WANDB_PROJECT=heron +export PROJECT_NAME=opt/exp001 +export WANDB_NAME=$PROJECT_NAME + +deepspeed train_ds.py --config_file projects/$PROJECT_NAME.yml diff --git a/train_ds.py b/train_ds.py new file mode 100644 index 0000000..e69de29 From 997457890c069a1e4309fe0e0d736653bbb5c285 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Thu, 19 Oct 2023 08:18:32 +0000 Subject: [PATCH 02/50] WIP update [train_ds]: cp from VisualChat --- train_ds.py | 534 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 534 insertions(+) diff --git a/train_ds.py b/train_ds.py index e69de29..06b96ce 100644 --- a/train_ds.py +++ b/train_ds.py @@ -0,0 +1,534 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse +import math +import os +import random +import sys + +import deepspeed +import numpy as np +import torch +import yaml +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler +from transformers import AdamW, AutoTokenizer, SchedulerType, get_scheduler + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +from utils.data import ( + DataCollatorPadToMaxLen, + build_dataset, + shuffle_dataset, + split_dataset, +) +from utils.ds_utils import get_train_ds_config +from utils.model import create_dsvl_model_and_transforms +from utils.module.lora import ( + convert_linear_layer_to_lora, + fuse_lora, + only_optimize_lora_parameters, + unfuse_lora, +) +from utils.utils import ( + get_all_reduce_mean, + get_optimizer_grouped_parameters, + print_rank_0, + save_hf_format, + save_zero_three_model, + set_random_seed, + to_device, +) + +# import heron library +sys.path.append("/home/yuma_ochi/heron-exp") +from heron.datasets.utils import get_dataset +from heron.models.utils import ( + apply_lora_model, + load_model, + load_pretrained_weight, + set_trainable_params, + unload_and_merge_lora, +) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a multi-modal task" + ) + + parser.add_argument( + "--data_path", type=str, default="./data/", help="Where the training data are stored." + ) + + parser.add_argument( + "--data_debug_path", + type=str, + default=None, + help="If provided, will save 10 training samples" "to the path for debug purpose.", + ) + + parser.add_argument( + "--data_train_split_ratio", + type=float, + default=0.9, + help="Ratio of dataset to be splitted as train data. The remaining becomes eval data.", + ) + parser.add_argument( + "--dataset_names", + nargs="*", + default=["minigpt4"], + help="Name of training dataset(s) to be used. Accepted format:" + "1) a single dataset name, 2) multiple dataset names in the" + "form: dataset1 dataset2 ...", + ) + + parser.add_argument( + "--dataset_samples", + nargs="*", + default=["all"], + help="How many samples do we use from each dataset." + "Should be either a integer number or string all which" + "means use all samples. 
For example: all 512 means" + "using all samples form first data and 512 samples" + "from second data", + ) + + parser.add_argument( + "--dataset_concatenate_samples", + nargs="*", + default=[1], + help="How many samples do we concatenate from each dataset." + "Should be either a integer number or string. 1 which" + "means use 1 sample for each datapoint", + ) + + parser.add_argument( + "--max_num_image_per_sample", + type=int, + default=8, + help="The maximum number of images per sample.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=2, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=2, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=4096, + help="The maximum sequence length, note that image tokens are included.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-3, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--learning_rate_pretraining_components", + type=float, + default=0, + help="Initial learning rate for pre-trained weight, e.g., embedding (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument( + "--num_train_epochs", + type=int, + default=6, + help="Total number of training epochs to perform.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="cosine", + help="The scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=float, + default=0, + help="Number of steps (>1) or ratios (<=1) for the warmup in the lr scheduler.", + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the model.") + parser.add_argument("--seed", type=int, default=1234, help="A seed for reproducible training.") + parser.add_argument( + "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus" + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Enable HF gradient checkpointing for model.", + ) + parser.add_argument( + "--lm_model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--vision_model_name_or_path", default="openai/clip-vit-large-patch14", type=str + ) + parser.add_argument( + "--enable_mmca_attention", + action="store_true", + help="enable the new proposed attn, which is similar to cross attention", + ) + parser.add_argument( + "--vis_proj", + type=str, + default="baseline", + help="[baseline, vit, or perceiver], used to projection vision feature to LLM embedding", + ) + # deepspeed features + parser.add_argument( + "--zero_stage", + type=int, + default=0, + help="ZeRO optimization stage for Actor model (and clones).", + ) + parser.add_argument( + "--precision", + type=str, + choices=["fp16", "bf16"], + default="fp16", + help="FP16 or BF16 precision. FP16 is recommended for typical use cases. 
BF16 is good for large models", + ) + parser.add_argument( + "--enable_tensorboard", action="store_true", help="Enable tensorboard logging" + ) + ## LoRA for efficient training setting + parser.add_argument( + "--lang_lora_dim", + type=int, + default=0, + help="Use LoRA for fine-tuning language decoder (> 0).", + ) + parser.add_argument( + "--lang_lora_module_name", + type=str, + default="model.layers.", + help="The scope name of the target LoRA parameters.", + ) + parser.add_argument( + "--vis_lora_dim", + type=int, + default=0, + help="Use LoRA for fine-tuning visual encoder (> 0).", + ) + parser.add_argument( + "--vis_lora_module_name", + type=str, + default="encoder.layers.", + help="The scope name of the target LoRA parameters.", + ) + parser.add_argument( + "--only_optimize_lora", action="store_true", help="Only optimize the LoRA parameters." + ) + parser.add_argument("--heron_config_file", type=str, help="heronのconfigファイルパス") + + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + + if args.learning_rate_pretraining_components == 0.0: + # if we do not provide special learning rate, mainly for embedding, the same lr is applied + args.learning_rate_pretraining_components = args.learning_rate + assert args.num_warmup_steps >= 0, "--num_warmup_steps must be >= 0" + if "qwen" in args.vision_model_name_or_path.lower(): + assert ( + args.vis_proj == "baseline" + ), "qwen's model only support baseline vis_proj as it has the perceiver module inside" + return args + + +def main(): + args = parse_args() + + if args.local_rank == -1: + device = torch.device("cuda") + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + deepspeed.init_distributed() + + args.global_rank = torch.distributed.get_rank() + + ds_config = get_train_ds_config(args, offload=False, stage=args.zero_stage) + ds_config["train_micro_batch_size_per_gpu"] = args.per_device_train_batch_size + ds_config["train_batch_size"] = ( + args.per_device_train_batch_size + * torch.distributed.get_world_size() + * args.gradient_accumulation_steps + ) + + # Heronのconfigを読み出す + with open(args.heron_config_file, "r") as i_: + config = yaml.safe_load(i_) + model_config = config["model_config"] + training_config = config["training_config"] + + # If passed along, set the training seed now. 
+ set_random_seed(args.seed) + + torch.distributed.barrier() + """ ↓ VisualChatのモデル定義 """ + # tokenizer = AutoTokenizer.from_pretrained(args.lm_model_name_or_path, + # fast_tokenizer=True) + # tokenizer.padding_side = 'right' + + # model, image_processor, tokenizer = create_dsvl_model_and_transforms( + # text_tokenizer=tokenizer, + # args=args, + # ds_config=ds_config) + # if args.lang_lora_dim > 0: + # model.lang_decoder = convert_linear_layer_to_lora(model.lang_decoder, args.lang_lora_module_name, args.lang_lora_dim) + # if args.only_optimize_lora: + # model.lang_decoder = only_optimize_lora_parameters(model.lang_decoder) + + # if args.vis_lora_dim > 0: + # model.vis_encoder = convert_linear_layer_to_lora(model.vis_encoder, args.vis_lora_module_name, args.vis_lora_dim) + # if args.only_optimize_lora: + # model.vis_encoder = only_optimize_lora_parameters(model.vis_encoder) + """ ↑ VsualChatのモデル定義 """ + + """ ↓ Heronのモデル定義 """ + # load model + model = load_model(model_config) + + if model_config["use_lora"]: + model = apply_lora_model(model, model_config) + """ ↑ Heronのモデル定義 """ + + print_rank_0(model, args.global_rank) + + """ ↓ VisualChat """ + # # Prepare the data + # if len(args.dataset_samples) < len(args.dataset_names): + # assert len(args.dataset_samples) == 1, "when args.dataset_samples is not the same length as args.dataset_names, it should be only one number" + # args.dataset_samples = [args.dataset_samples[0]] * len(args.dataset_names) + # if len(args.dataset_concatenate_samples) < len(args.dataset_names): + # assert len(args.dataset_concatenate_samples) == 1, "when args.dataset_concatenate_samples is not the same length as args.dataset_names, it should be only one number" + # args.dataset_concatenate_samples = [args.dataset_concatenate_samples[0]] * len(args.dataset_names) + # # convert to int + # args.dataset_concatenate_samples = [int(i) for i in args.dataset_concatenate_samples] + + # dataset = build_dataset( + # args.data_path, + # args.data_debug_path, + # args.dataset_names, + # args.dataset_samples, + # args.dataset_concatenate_samples, + # args.max_num_image_per_sample, + # vis_processor=image_processor, + # tokenizer=tokenizer, + # ) + # # split the dataset into train and evaluation + # total_data = len(dataset) + # np_rng = np.random.RandomState(seed=args.seed) + # dataset = shuffle_dataset(dataset, np_rng) + # train_dataset, eval_dataset = split_dataset(dataset, args.data_train_split_ratio) + """ ↑ VisualChat """ + + """ ↓ Heron """ + config["dataset_config_path"] = [ + os.path.join("/home/yuma_ochi/heron-exp", path) for path in config["dataset_config_path"] + ] + train_dataset, eval_dataset = get_dataset(config) + """ ↑ Heron """ + + train_dataloader = DataLoader( + train_dataset, + batch_size=args.per_device_train_batch_size, + sampler=DistributedSampler(train_dataset, shuffle=True, drop_last=True), + # collate_fn=DataCollatorPadToMaxLen(args.max_seq_len, tokenizer.pad_token_id), # Heronはtokenizeしない + ) + + eval_dataloader = DataLoader( + eval_dataset, + batch_size=args.per_device_eval_batch_size, + sampler=DistributedSampler(eval_dataset, shuffle=False), + # collate_fn=DataCollatorPadToMaxLen(args.max_seq_len, tokenizer.pad_token_id), # Heronはtokenizeしない + ) + + # Split weights in two groups, one with weight decay and the other not. 
+ optimizer_grouped_parameters = get_optimizer_grouped_parameters( + model, args.weight_decay, small_lr=args.learning_rate_pretraining_components + ) + + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, betas=(0.9, 0.95)) + + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + if args.num_warmup_steps <= 1: + args.num_warmup_steps = int( + args.num_warmup_steps * args.num_train_epochs * num_update_steps_per_epoch + ) + else: + args.num_warmup_steps = int(args.num_warmup_steps) + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.num_train_epochs * num_update_steps_per_epoch, + ) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + args=args, + config=ds_config, + lr_scheduler=lr_scheduler, + dist_init_required=True, + ) + # assert 0 + + start_epoch = 0 + """ 学習済みモデルは使わない """ + # # let load checkpoint + # if os.path.exists(os.path.join(args.output_dir, 'latest')): + # # we have the deepspeed chekpoint so it is a resumed job + # # TODO: after loading the ckpt, the global step is not loaded. Need to ask Tunji/Ammar for help. + # _, client_state = model.load_checkpoint(args.output_dir) + # start_epoch = client_state['epoch'] + # best_loss = client_state['best_loss'] + # random.setstate(client_state['random_rng_state']) + # np.random.set_state(client_state['np_rng_state']) + # torch.set_rng_state(client_state['torch_rng_state']) + # torch.cuda.set_rng_state(client_state['torch_cuda_rng_state']) + + if args.gradient_checkpointing: + model.gradient_checkpointing_enable() + + def evaluation(model, eval_dataloader): + model.eval() + print("Evaluation") + acc_loss = 0 + for step, batch in enumerate(eval_dataloader): + print(step) + with torch.no_grad(): + batch = to_device(batch, device) + loss = model( + batch["image"].half(), + batch["input_ids"], + attention_mask=batch["attention_mask"], + input_labels=batch["labels"], + image_num=batch["image_num"], + )[0] + acc_loss += loss + model.train() + acc_loss = get_all_reduce_mean(acc_loss).item() + ave_loss = acc_loss / (step + 1) + print_rank_0(f"the eval average_loss: {ave_loss}", args.global_rank) + return ave_loss + + # Train! 
+ if start_epoch == 0: + print_rank_0("***** Before training *****", args.global_rank) + # evaluation(model, eval_dataloader) + best_loss = 1e6 + + print_rank_0("***** Running training *****", args.global_rank) + for epoch in range(start_epoch, args.num_train_epochs): + print_rank_0( + f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", + args.global_rank, + ) + model.train() + acc_loss = 0 + for step, batch in enumerate(train_dataloader): + batch = to_device( + batch, device + ) # torch.size(1, 3, 224, 224]) #torch.Size([1, 1, 3, 224, 224]) + """ ↓ VisualChatの1step """ + # images = batch["image"].half() + # attention_mask = batch["attention_mask"] + # labels = batch["labels"] + # loss = model( + # images, + # input_ids, + # attention_mask=attention_mask, + # input_labels=labels, + # image_num=batch["image_num"], + # )[0] + """ ↑ VisualChatの1step """ + + """ ↓ Heronの1step """ + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + # position_ids = batch["position_ids"] + pixel_values = batch["pixel_values"] + labels = batch["labels"] + loss = model( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values, + labels=labels, + )[0] + print(loss) + + """ ↑ Heronの1step """ + + acc_loss += loss.detach().clone() + model.backward(loss) + model.step() + model.tput_timer.update_epoch_count() + acc_loss = get_all_reduce_mean(acc_loss).item() + print_rank_0(f"Epoch {epoch+1}, the average_loss: {acc_loss/step}", args.global_rank) + eval_loss = evaluation(model, eval_dataloader) + + if eval_loss < best_loss: + best_loss = eval_loss + + model = fuse_lora(model) + if args.global_rank == 0: + save_hf_format(model, tokenizer, args, f"epoch-{epoch}") + if args.zero_stage == 3: + # For zero stage 3, each gpu only has a part of the model, so we need a special save function + save_zero_three_model( + model, + args.global_rank, + args.output_dir, + zero_stage=args.zero_stage, + sub_folder=f"epoch-{epoch}", + ) + model = unfuse_lora(model) + # save deepspeed zero checkpoint so we can resume training if needed + client_state = { + "random_rng_state": random.getstate(), + "np_rng_state": np.random.get_state(), + "torch_rng_state": torch.get_rng_state(), + "torch_cuda_rng_state": torch.cuda.get_rng_state(), + "epoch": epoch + 1, # start from next epoch + "best_loss": best_loss, + } + model.save_checkpoint(args.output_dir, client_state=client_state) # save to the latest + + +if __name__ == "__main__": + main() From 9f32e64c9aac116fef4d7132be6d7e03ed4f671a Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Thu, 19 Oct 2023 08:27:47 +0000 Subject: [PATCH 03/50] rm comment and argparser, set fire --- train_ds.py | 80 ++++++++++------------------------------------------- 1 file changed, 14 insertions(+), 66 deletions(-) diff --git a/train_ds.py b/train_ds.py index 06b96ce..690e225 100644 --- a/train_ds.py +++ b/train_ds.py @@ -3,13 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -import argparse import math import os import random import sys import deepspeed +import fire import numpy as np import torch import yaml @@ -265,8 +265,18 @@ def parse_args(): return args -def main(): - args = parse_args() +def main(config_file: str, local_rank: int = 0): + with open(config_file, "r") as i_: + config = yaml.safe_load(i_) + model_config = config["model_config"] + training_config = config["training_config"] + + if os.environ.get("WANDB_NAME") is not None: + training_config["output_dir"] = os.path.join( + 
training_config["output_dir"], os.environ["WANDB_NAME"] + ) + + # args = parse_args() if args.local_rank == -1: device = torch.device("cuda") @@ -296,26 +306,6 @@ def main(): set_random_seed(args.seed) torch.distributed.barrier() - """ ↓ VisualChatのモデル定義 """ - # tokenizer = AutoTokenizer.from_pretrained(args.lm_model_name_or_path, - # fast_tokenizer=True) - # tokenizer.padding_side = 'right' - - # model, image_processor, tokenizer = create_dsvl_model_and_transforms( - # text_tokenizer=tokenizer, - # args=args, - # ds_config=ds_config) - # if args.lang_lora_dim > 0: - # model.lang_decoder = convert_linear_layer_to_lora(model.lang_decoder, args.lang_lora_module_name, args.lang_lora_dim) - # if args.only_optimize_lora: - # model.lang_decoder = only_optimize_lora_parameters(model.lang_decoder) - - # if args.vis_lora_dim > 0: - # model.vis_encoder = convert_linear_layer_to_lora(model.vis_encoder, args.vis_lora_module_name, args.vis_lora_dim) - # if args.only_optimize_lora: - # model.vis_encoder = only_optimize_lora_parameters(model.vis_encoder) - """ ↑ VsualChatのモデル定義 """ - """ ↓ Heronのモデル定義 """ # load model model = load_model(model_config) @@ -326,34 +316,6 @@ def main(): print_rank_0(model, args.global_rank) - """ ↓ VisualChat """ - # # Prepare the data - # if len(args.dataset_samples) < len(args.dataset_names): - # assert len(args.dataset_samples) == 1, "when args.dataset_samples is not the same length as args.dataset_names, it should be only one number" - # args.dataset_samples = [args.dataset_samples[0]] * len(args.dataset_names) - # if len(args.dataset_concatenate_samples) < len(args.dataset_names): - # assert len(args.dataset_concatenate_samples) == 1, "when args.dataset_concatenate_samples is not the same length as args.dataset_names, it should be only one number" - # args.dataset_concatenate_samples = [args.dataset_concatenate_samples[0]] * len(args.dataset_names) - # # convert to int - # args.dataset_concatenate_samples = [int(i) for i in args.dataset_concatenate_samples] - - # dataset = build_dataset( - # args.data_path, - # args.data_debug_path, - # args.dataset_names, - # args.dataset_samples, - # args.dataset_concatenate_samples, - # args.max_num_image_per_sample, - # vis_processor=image_processor, - # tokenizer=tokenizer, - # ) - # # split the dataset into train and evaluation - # total_data = len(dataset) - # np_rng = np.random.RandomState(seed=args.seed) - # dataset = shuffle_dataset(dataset, np_rng) - # train_dataset, eval_dataset = split_dataset(dataset, args.data_train_split_ratio) - """ ↑ VisualChat """ - """ ↓ Heron """ config["dataset_config_path"] = [ os.path.join("/home/yuma_ochi/heron-exp", path) for path in config["dataset_config_path"] @@ -365,14 +327,12 @@ def main(): train_dataset, batch_size=args.per_device_train_batch_size, sampler=DistributedSampler(train_dataset, shuffle=True, drop_last=True), - # collate_fn=DataCollatorPadToMaxLen(args.max_seq_len, tokenizer.pad_token_id), # Heronはtokenizeしない ) eval_dataloader = DataLoader( eval_dataset, batch_size=args.per_device_eval_batch_size, sampler=DistributedSampler(eval_dataset, shuffle=False), - # collate_fn=DataCollatorPadToMaxLen(args.max_seq_len, tokenizer.pad_token_id), # Heronはtokenizeしない ) # Split weights in two groups, one with weight decay and the other not. 
@@ -465,18 +425,6 @@ def evaluation(model, eval_dataloader): batch = to_device( batch, device ) # torch.size(1, 3, 224, 224]) #torch.Size([1, 1, 3, 224, 224]) - """ ↓ VisualChatの1step """ - # images = batch["image"].half() - # attention_mask = batch["attention_mask"] - # labels = batch["labels"] - # loss = model( - # images, - # input_ids, - # attention_mask=attention_mask, - # input_labels=labels, - # image_num=batch["image_num"], - # )[0] - """ ↑ VisualChatの1step """ """ ↓ Heronの1step """ input_ids = batch["input_ids"] @@ -531,4 +479,4 @@ def evaluation(model, eval_dataloader): if __name__ == "__main__": - main() + fire.Fire(main) From 5c821694436c283ddbe630bcca464193e8c9fcf8 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Thu, 19 Oct 2023 13:32:05 +0000 Subject: [PATCH 04/50] update [train_ds.py, config] --- projects/opt/exp002_ds.yml | 56 ++++++ train_ds.py | 375 +++++++------------------------------ 2 files changed, 127 insertions(+), 304 deletions(-) create mode 100644 projects/opt/exp002_ds.yml diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml new file mode 100644 index 0000000..b7282ed --- /dev/null +++ b/projects/opt/exp002_ds.yml @@ -0,0 +1,56 @@ +training_config: + per_device_train_batch_size: 2 + per_device_eval_batch_size: 2 + gradient_accumulation_steps: 4 + num_train_epochs: 1 + dataloader_num_workers: 16 + fp16: true + optim: "adamw_torch" + learning_rate: 5.0e-5 + logging_steps: 100 + evaluation_strategy: "steps" + save_strategy: "steps" + eval_steps: 4000 + save_steps: 4000 + save_total_limit: 1 + deepspeed: ./configs/deepspeed/ds_config_zero1.json + output_dir: ./output/ + report_to: "wandb" + zero_stage: 3 + precision: "fp16" + enable_tensorboard: False + seed: 0 + weight_decay: 0. + learning_rate_pretraining_components: 0. + num_warmup_steps: 0. 
+ lr_scheduler_type: "cosine" + gradient_checkpointing: False + + +model_config: + fp16: true + pretrained_path: # None or path to model weight + model_type: git_llm + language_model_name: facebook/opt-350m + vision_model_name: openai/clip-vit-base-patch16 + num_image_with_embedding: 1 # if 1, no img_temporal_embedding + max_length: 512 + keys_to_finetune: + - visual_projection + - num_image_with_embedding + keys_to_freeze: [] + + use_lora: true + lora: + r: 8 + lora_alpha: 32 + target_modules: + - q_proj + - k_proj + - v_proj + lora_dropout: 0.01 + bias: none + task_type: CAUSAL_LM + +dataset_config_path: + - ./configs/datasets/m3it.yaml diff --git a/train_ds.py b/train_ds.py index 690e225..07f2702 100644 --- a/train_ds.py +++ b/train_ds.py @@ -17,22 +17,16 @@ from torch.utils.data.distributed import DistributedSampler from transformers import AdamW, AutoTokenizer, SchedulerType, get_scheduler -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.data import ( - DataCollatorPadToMaxLen, - build_dataset, - shuffle_dataset, - split_dataset, -) -from utils.ds_utils import get_train_ds_config -from utils.model import create_dsvl_model_and_transforms -from utils.module.lora import ( - convert_linear_layer_to_lora, - fuse_lora, - only_optimize_lora_parameters, - unfuse_lora, +from heron.datasets.utils import get_dataset +from heron.models.utils import ( + apply_lora_model, + load_model, + load_pretrained_weight, + set_trainable_params, + unload_and_merge_lora, ) -from utils.utils import ( +from heron.utils.ds_utils import get_train_ds_config +from heron.utils.utils import ( get_all_reduce_mean, get_optimizer_grouped_parameters, print_rank_0, @@ -42,228 +36,6 @@ to_device, ) -# import heron library -sys.path.append("/home/yuma_ochi/heron-exp") -from heron.datasets.utils import get_dataset -from heron.models.utils import ( - apply_lora_model, - load_model, - load_pretrained_weight, - set_trainable_params, - unload_and_merge_lora, -) - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Finetune a transformers model on a multi-modal task" - ) - - parser.add_argument( - "--data_path", type=str, default="./data/", help="Where the training data are stored." - ) - - parser.add_argument( - "--data_debug_path", - type=str, - default=None, - help="If provided, will save 10 training samples" "to the path for debug purpose.", - ) - - parser.add_argument( - "--data_train_split_ratio", - type=float, - default=0.9, - help="Ratio of dataset to be splitted as train data. The remaining becomes eval data.", - ) - parser.add_argument( - "--dataset_names", - nargs="*", - default=["minigpt4"], - help="Name of training dataset(s) to be used. Accepted format:" - "1) a single dataset name, 2) multiple dataset names in the" - "form: dataset1 dataset2 ...", - ) - - parser.add_argument( - "--dataset_samples", - nargs="*", - default=["all"], - help="How many samples do we use from each dataset." - "Should be either a integer number or string all which" - "means use all samples. For example: all 512 means" - "using all samples form first data and 512 samples" - "from second data", - ) - - parser.add_argument( - "--dataset_concatenate_samples", - nargs="*", - default=[1], - help="How many samples do we concatenate from each dataset." - "Should be either a integer number or string. 
1 which" - "means use 1 sample for each datapoint", - ) - - parser.add_argument( - "--max_num_image_per_sample", - type=int, - default=8, - help="The maximum number of images per sample.", - ) - parser.add_argument( - "--per_device_train_batch_size", - type=int, - default=2, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--per_device_eval_batch_size", - type=int, - default=2, - help="Batch size (per device) for the evaluation dataloader.", - ) - parser.add_argument( - "--max_seq_len", - type=int, - default=4096, - help="The maximum sequence length, note that image tokens are included.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-3, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--learning_rate_pretraining_components", - type=float, - default=0, - help="Initial learning rate for pre-trained weight, e.g., embedding (after the potential warmup period) to use.", - ) - parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") - parser.add_argument( - "--num_train_epochs", - type=int, - default=6, - help="Total number of training epochs to perform.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--lr_scheduler_type", - type=SchedulerType, - default="cosine", - help="The scheduler type to use.", - choices=[ - "linear", - "cosine", - "cosine_with_restarts", - "polynomial", - "constant", - "constant_with_warmup", - ], - ) - parser.add_argument( - "--num_warmup_steps", - type=float, - default=0, - help="Number of steps (>1) or ratios (<=1) for the warmup in the lr scheduler.", - ) - parser.add_argument("--output_dir", type=str, default=None, help="Where to store the model.") - parser.add_argument("--seed", type=int, default=1234, help="A seed for reproducible training.") - parser.add_argument( - "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus" - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Enable HF gradient checkpointing for model.", - ) - parser.add_argument( - "--lm_model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from huggingface.co/models.", - required=True, - ) - parser.add_argument( - "--vision_model_name_or_path", default="openai/clip-vit-large-patch14", type=str - ) - parser.add_argument( - "--enable_mmca_attention", - action="store_true", - help="enable the new proposed attn, which is similar to cross attention", - ) - parser.add_argument( - "--vis_proj", - type=str, - default="baseline", - help="[baseline, vit, or perceiver], used to projection vision feature to LLM embedding", - ) - # deepspeed features - parser.add_argument( - "--zero_stage", - type=int, - default=0, - help="ZeRO optimization stage for Actor model (and clones).", - ) - parser.add_argument( - "--precision", - type=str, - choices=["fp16", "bf16"], - default="fp16", - help="FP16 or BF16 precision. FP16 is recommended for typical use cases. 
BF16 is good for large models", - ) - parser.add_argument( - "--enable_tensorboard", action="store_true", help="Enable tensorboard logging" - ) - ## LoRA for efficient training setting - parser.add_argument( - "--lang_lora_dim", - type=int, - default=0, - help="Use LoRA for fine-tuning language decoder (> 0).", - ) - parser.add_argument( - "--lang_lora_module_name", - type=str, - default="model.layers.", - help="The scope name of the target LoRA parameters.", - ) - parser.add_argument( - "--vis_lora_dim", - type=int, - default=0, - help="Use LoRA for fine-tuning visual encoder (> 0).", - ) - parser.add_argument( - "--vis_lora_module_name", - type=str, - default="encoder.layers.", - help="The scope name of the target LoRA parameters.", - ) - parser.add_argument( - "--only_optimize_lora", action="store_true", help="Only optimize the LoRA parameters." - ) - parser.add_argument("--heron_config_file", type=str, help="heronのconfigファイルパス") - - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args() - - if args.learning_rate_pretraining_components == 0.0: - # if we do not provide special learning rate, mainly for embedding, the same lr is applied - args.learning_rate_pretraining_components = args.learning_rate - assert args.num_warmup_steps >= 0, "--num_warmup_steps must be >= 0" - if "qwen" in args.vision_model_name_or_path.lower(): - assert ( - args.vis_proj == "baseline" - ), "qwen's model only support baseline vis_proj as it has the perceiver module inside" - return args - def main(config_file: str, local_rank: int = 0): with open(config_file, "r") as i_: @@ -278,111 +50,107 @@ def main(config_file: str, local_rank: int = 0): # args = parse_args() - if args.local_rank == -1: + if local_rank == -1: device = torch.device("cuda") else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) # Initializes the distributed backend which will take care of sychronizing nodes/GPUs deepspeed.init_distributed() - args.global_rank = torch.distributed.get_rank() + training_config["global_rank"] = torch.distributed.get_rank() - ds_config = get_train_ds_config(args, offload=False, stage=args.zero_stage) - ds_config["train_micro_batch_size_per_gpu"] = args.per_device_train_batch_size + ds_config = get_train_ds_config( + training_config, offload=False, stage=training_config["zero_stage"] + ) + ds_config["train_micro_batch_size_per_gpu"] = training_config["per_device_train_batch_size"] ds_config["train_batch_size"] = ( - args.per_device_train_batch_size + training_config["per_device_train_batch_size"] * torch.distributed.get_world_size() - * args.gradient_accumulation_steps + * training_config["gradient_accumulation_steps"] ) - # Heronのconfigを読み出す - with open(args.heron_config_file, "r") as i_: - config = yaml.safe_load(i_) - model_config = config["model_config"] - training_config = config["training_config"] - # If passed along, set the training seed now. 
- set_random_seed(args.seed) + set_random_seed(training_config["seed"]) torch.distributed.barrier() - """ ↓ Heronのモデル定義 """ + # load model model = load_model(model_config) if model_config["use_lora"]: model = apply_lora_model(model, model_config) - """ ↑ Heronのモデル定義 """ - print_rank_0(model, args.global_rank) + print_rank_0(model, training_config["global_rank"]) - """ ↓ Heron """ config["dataset_config_path"] = [ os.path.join("/home/yuma_ochi/heron-exp", path) for path in config["dataset_config_path"] ] train_dataset, eval_dataset = get_dataset(config) - """ ↑ Heron """ train_dataloader = DataLoader( train_dataset, - batch_size=args.per_device_train_batch_size, + batch_size=training_config["per_device_train_batch_size"], sampler=DistributedSampler(train_dataset, shuffle=True, drop_last=True), ) eval_dataloader = DataLoader( eval_dataset, - batch_size=args.per_device_eval_batch_size, + batch_size=training_config["per_device_eval_batch_size"], sampler=DistributedSampler(eval_dataset, shuffle=False), ) # Split weights in two groups, one with weight decay and the other not. optimizer_grouped_parameters = get_optimizer_grouped_parameters( - model, args.weight_decay, small_lr=args.learning_rate_pretraining_components + model, + training_config["weight_decay"], + small_lr=training_config["learning_rate_pretraining_components"], ) - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, betas=(0.9, 0.95)) + optimizer = AdamW( + optimizer_grouped_parameters, lr=training_config["learning_rate"], betas=(0.9, 0.95) + ) num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps + len(train_dataloader) / training_config["gradient_accumulation_steps"] ) - if args.num_warmup_steps <= 1: - args.num_warmup_steps = int( - args.num_warmup_steps * args.num_train_epochs * num_update_steps_per_epoch + if training_config["num_warmup_steps"] <= 1: + training_config["num_warmup_steps"] = int( + training_config["num_warmup_steps"] + * training_config["num_train_epochs"] + * num_update_steps_per_epoch ) else: - args.num_warmup_steps = int(args.num_warmup_steps) + training_config["num_warmup_steps"] = int(training_config["num_warmup_steps"]) lr_scheduler = get_scheduler( - name=args.lr_scheduler_type, + name=training_config["lr_scheduler_type"], optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.num_train_epochs * num_update_steps_per_epoch, + num_warmup_steps=training_config["num_warmup_steps"], + num_training_steps=training_config["num_train_epochs"] * num_update_steps_per_epoch, ) model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, optimizer=optimizer, - args=args, config=ds_config, lr_scheduler=lr_scheduler, dist_init_required=True, ) - # assert 0 start_epoch = 0 - """ 学習済みモデルは使わない """ - # # let load checkpoint - # if os.path.exists(os.path.join(args.output_dir, 'latest')): - # # we have the deepspeed chekpoint so it is a resumed job - # # TODO: after loading the ckpt, the global step is not loaded. Need to ask Tunji/Ammar for help. 
- # _, client_state = model.load_checkpoint(args.output_dir) - # start_epoch = client_state['epoch'] - # best_loss = client_state['best_loss'] - # random.setstate(client_state['random_rng_state']) - # np.random.set_state(client_state['np_rng_state']) - # torch.set_rng_state(client_state['torch_rng_state']) - # torch.cuda.set_rng_state(client_state['torch_cuda_rng_state']) - - if args.gradient_checkpointing: + # let load checkpoint + if os.path.exists(os.path.join(training_config["output_dir"], "latest")): + # we have the deepspeed chekpoint so it is a resumed job + # TODO: after loading the ckpt, the global step is not loaded. Need to ask Tunji/Ammar for help. + _, client_state = model.load_checkpoint(training_config["output_dir"]) + start_epoch = client_state["epoch"] + best_loss = client_state["best_loss"] + random.setstate(client_state["random_rng_state"]) + np.random.set_state(client_state["np_rng_state"]) + torch.set_rng_state(client_state["torch_rng_state"]) + torch.cuda.set_rng_state(client_state["torch_cuda_rng_state"]) + + if training_config["gradient_checkpointing"]: model.gradient_checkpointing_enable() def evaluation(model, eval_dataloader): @@ -390,7 +158,6 @@ def evaluation(model, eval_dataloader): print("Evaluation") acc_loss = 0 for step, batch in enumerate(eval_dataloader): - print(step) with torch.no_grad(): batch = to_device(batch, device) loss = model( @@ -404,20 +171,20 @@ def evaluation(model, eval_dataloader): model.train() acc_loss = get_all_reduce_mean(acc_loss).item() ave_loss = acc_loss / (step + 1) - print_rank_0(f"the eval average_loss: {ave_loss}", args.global_rank) + print_rank_0(f"the eval average_loss: {ave_loss}", training_config["global_rank"]) return ave_loss # Train! if start_epoch == 0: - print_rank_0("***** Before training *****", args.global_rank) + print_rank_0("***** Before training *****", training_config["global_rank"]) # evaluation(model, eval_dataloader) best_loss = 1e6 - print_rank_0("***** Running training *****", args.global_rank) - for epoch in range(start_epoch, args.num_train_epochs): + print_rank_0("***** Running training *****", training_config["global_rank"]) + for epoch in range(start_epoch, training_config["num_train_epochs"]): print_rank_0( - f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", - args.global_rank, + f"Beginning of Epoch {epoch+1}/{training_config['num_train_epochs']}, Total Micro Batches {len(train_dataloader)}", + training_config["global_rank"], ) model.train() acc_loss = 0 @@ -426,7 +193,6 @@ def evaluation(model, eval_dataloader): batch, device ) # torch.size(1, 3, 224, 224]) #torch.Size([1, 1, 3, 224, 224]) - """ ↓ Heronの1step """ input_ids = batch["input_ids"] attention_mask = batch["attention_mask"] # position_ids = batch["position_ids"] @@ -438,31 +204,30 @@ def evaluation(model, eval_dataloader): pixel_values=pixel_values, labels=labels, )[0] - print(loss) - - """ ↑ Heronの1step """ acc_loss += loss.detach().clone() model.backward(loss) model.step() model.tput_timer.update_epoch_count() acc_loss = get_all_reduce_mean(acc_loss).item() - print_rank_0(f"Epoch {epoch+1}, the average_loss: {acc_loss/step}", args.global_rank) + print_rank_0( + f"Epoch {epoch+1}, the average_loss: {acc_loss/step}", training_config["global_rank"] + ) eval_loss = evaluation(model, eval_dataloader) if eval_loss < best_loss: best_loss = eval_loss model = fuse_lora(model) - if args.global_rank == 0: - save_hf_format(model, tokenizer, args, f"epoch-{epoch}") - if args.zero_stage == 3: + if 
training_config["global_rank"] == 0: + save_hf_format(model, tokenizer, training_config, f"epoch-{epoch}") + if training_config["zero_stage"] == 3: # For zero stage 3, each gpu only has a part of the model, so we need a special save function save_zero_three_model( model, - args.global_rank, - args.output_dir, - zero_stage=args.zero_stage, + training_config["global_rank"], + training_config["output_dir"], + zero_stage=training_config["zero_stage"], sub_folder=f"epoch-{epoch}", ) model = unfuse_lora(model) @@ -475,7 +240,9 @@ def evaluation(model, eval_dataloader): "epoch": epoch + 1, # start from next epoch "best_loss": best_loss, } - model.save_checkpoint(args.output_dir, client_state=client_state) # save to the latest + model.save_checkpoint( + training_config["output_dir"], client_state=client_state + ) # save to the latest if __name__ == "__main__": From 62c4ed522703e53436a2e94c41531082c45668b1 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Thu, 19 Oct 2023 14:35:22 +0000 Subject: [PATCH 05/50] fix [train_ds.py]: set to half the input batch and eval model input --- train_ds.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/train_ds.py b/train_ds.py index 07f2702..c6fdc78 100644 --- a/train_ds.py +++ b/train_ds.py @@ -38,6 +38,8 @@ def main(config_file: str, local_rank: int = 0): + torch.cuda.empty_cache() + with open(config_file, "r") as i_: config = yaml.safe_load(i_) model_config = config["model_config"] @@ -48,8 +50,6 @@ def main(config_file: str, local_rank: int = 0): training_config["output_dir"], os.environ["WANDB_NAME"] ) - # args = parse_args() - if local_rank == -1: device = torch.device("cuda") else: @@ -60,6 +60,9 @@ def main(config_file: str, local_rank: int = 0): training_config["global_rank"] = torch.distributed.get_rank() + set_random_seed(training_config["seed"]) + + # DeepSpeedの初期化に必要な変数を設定 ds_config = get_train_ds_config( training_config, offload=False, stage=training_config["zero_stage"] ) @@ -70,9 +73,7 @@ def main(config_file: str, local_rank: int = 0): * training_config["gradient_accumulation_steps"] ) - # If passed along, set the training seed now. 
- set_random_seed(training_config["seed"]) - + # すべてのプロセスの処理が終わるまで待機 torch.distributed.barrier() # load model @@ -161,11 +162,10 @@ def evaluation(model, eval_dataloader): with torch.no_grad(): batch = to_device(batch, device) loss = model( - batch["image"].half(), - batch["input_ids"], + input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], - input_labels=batch["labels"], - image_num=batch["image_num"], + pixel_values=batch["pixel_values"], + labels=batch["labels"], )[0] acc_loss += loss model.train() @@ -196,7 +196,7 @@ def evaluation(model, eval_dataloader): input_ids = batch["input_ids"] attention_mask = batch["attention_mask"] # position_ids = batch["position_ids"] - pixel_values = batch["pixel_values"] + pixel_values = batch["pixel_values"].half() labels = batch["labels"] loss = model( input_ids=input_ids, From 62cb28d05a6c2bf8ca2a7ad84ab31c9440cfa5fa Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 23 Oct 2023 10:52:48 +0000 Subject: [PATCH 06/50] add [utils.py]: for train_dspy --- heron/utils/ds_utils.py | 91 +++++++++++++++++ heron/utils/utils.py | 217 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 308 insertions(+) create mode 100755 heron/utils/ds_utils.py create mode 100644 heron/utils/utils.py diff --git a/heron/utils/ds_utils.py b/heron/utils/ds_utils.py new file mode 100755 index 0000000..0be0198 --- /dev/null +++ b/heron/utils/ds_utils.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +GLOBAL_BATCH_SIZE = 32 +MICRO_BATCH_SIZE = 4 + + +def get_train_ds_config( + config, + offload, + stage=2, + enable_hybrid_engine=False, + inference_tp_size=1, + release_inference_cache=False, + pin_parameters=True, + tp_gather_partition_size=8, + max_out_tokens=512, +): + if config["precision"] == "fp16": + enable_fp16 = True + enable_bf16 = False + elif config["precision"] == "bf16": + enable_fp16 = False + enable_bf16 = True + else: + raise ValueError(f"Invalid precision {config['precision']}") + device = "cpu" if offload else "none" + zero_opt_dict = { + "stage": stage, + "offload_param": {"device": device}, + "offload_optimizer": {"device": device}, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 3e7, + "stage3_prefetch_bucket_size": 0, + "memory_efficient_linear": False, + } + output = { + "train_batch_size": GLOBAL_BATCH_SIZE, + "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, + "steps_per_print": 10, + "zero_optimization": zero_opt_dict, + "zero_allow_untested_optimizer": True, + "zero_force_ds_cpu_optimizer": False, + "fp16": {"enabled": enable_fp16, "loss_scale_window": 100}, + "bf16": { + "enabled": enable_bf16, + }, + "gradient_clipping": 1.0, + "prescale_gradients": False, + "wall_clock_breakdown": False, + "hybrid_engine": { + "enabled": enable_hybrid_engine, + "max_out_tokens": max_out_tokens, + "inference_tp_size": inference_tp_size, + "release_inference_cache": release_inference_cache, + "pin_parameters": pin_parameters, + "tp_gather_partition_size": tp_gather_partition_size, + }, + } + if config["enable_tensorboard"]: + output.update( + { + "tensorboard": { + "enabled": True, + "output_path": config["output_dir"], + "job_name": "tb_logging", + } + } + ) + return output + + +def get_eval_ds_config(offload, stage=0): + device = "cpu" if offload else "none" + zero_opt_dict = { + "stage": stage, + "stage3_param_persistence_threshold": 1e4, + "offload_param": {"device": device}, + "memory_efficient_linear": False, + } + return { + "train_batch_size": 
GLOBAL_BATCH_SIZE, + "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, + "steps_per_print": 10, + "zero_optimization": zero_opt_dict, + "fp16": {"enabled": True}, + "gradient_clipping": 1.0, + "prescale_gradients": False, + "wall_clock_breakdown": False, + } diff --git a/heron/utils/utils.py b/heron/utils/utils.py new file mode 100644 index 0000000..bebe908 --- /dev/null +++ b/heron/utils/utils.py @@ -0,0 +1,217 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +import json + +# DeepSpeed Team +import os +import random + +import deepspeed +import numpy as np +import torch +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from transformers import AutoTokenizer, set_seed + + +def print_rank_0(msg, rank=None): + if rank is not None and rank <= 0: + print(msg) + elif is_rank_0(): + print(msg) + + +def is_rank_0(): + """Check whether it is rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + return True + else: + return False + else: + return True + + +def get_rank(): + """Check whether it is rank 0.""" + if torch.distributed.is_initialized(): + return torch.distributed.get_rank() + else: + return 0 + + +def to_device(batch, device): + output = {} + for k, v in batch.items(): + try: + output[k] = v.to(device) + except: + output[k] = v + return output + + +class MovingAverage: + def __init__(self): + self.count = 0 + self.total = 0 + self.mean = 0 + + def update(self, num): + self.total += num + self.count += 1 + self.mean = self.total / self.count + + return self.mean + + +def set_random_seed(seed): + if seed is not None: + set_seed(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def get_all_reduce_mean(tensor): + torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) + tensor = tensor / torch.distributed.get_world_size() + return tensor + + +def get_optimizer_grouped_parameters( + model, + weight_decay, + no_decay_name_list=["bias", "LayerNorm.weight"], + small_learning_rate_list=["embed"], + small_lr=1e-4, +): + optimizer_grouped_parameters = [ + { + "params": [ + p + for n, p in model.named_parameters() + if ( + not any(nd in n for nd in no_decay_name_list) + and (not any(nd in n for nd in small_learning_rate_list)) + and p.requires_grad + ) + ], + "weight_decay": weight_decay, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if ( + any(nd in n for nd in no_decay_name_list) + and (not any(nd in n for nd in small_learning_rate_list)) + and p.requires_grad + ) + ], + "weight_decay": 0.0, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if ( + not any(nd in n for nd in no_decay_name_list) + and (any(nd in n for nd in small_learning_rate_list)) + and p.requires_grad + ) + ], + "weight_decay": weight_decay, + "lr": small_lr, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if ( + any(nd in n for nd in no_decay_name_list) + and (any(nd in n for nd in small_learning_rate_list)) + and p.requires_grad + ) + ], + "weight_decay": 0.0, + "lr": small_lr, + }, + ] + return optimizer_grouped_parameters + + +def _z3_params_to_fetch(param_list): + return [ + p + for p in param_list + if hasattr(p, "ds_id") and p.ds_status == ZeroParamStatus.NOT_AVAILABLE + ] + + +def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): + zero_stage_3 = zero_stage == 3 + with torch.no_grad(): + for param, param_ema in zip(model.parameters(), model_ema.parameters()): + 
# TODO: use prefiltering for efficiency + params_to_fetch = _z3_params_to_fetch([param, param_ema]) if zero_stage_3 else [] + should_gather_param = len(params_to_fetch) > 0 + with deepspeed.zero.GatheredParameters(params_to_fetch, enabled=should_gather_param): + data = param.data + if device is not None: + data = data.to(device) + param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) + + +def save_hf_format(model, tokenizer, config, sub_folder=""): + # used to save huggingface format, so we can use it for hf.from_pretrained + model_to_save = model.module if hasattr(model, "module") else model + CONFIG_NAME = "config.json" + WEIGHTS_NAME = "pytorch_model.bin" + output_dir = os.path.join(args.output_dir, sub_folder) + os.makedirs(output_dir, exist_ok=True) + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_dir, CONFIG_NAME) + save_dict = model_to_save.state_dict() + # for key in list(save_dict.keys()): + # if "lora" in key: + # del save_dict[key] + torch.save(save_dict, output_model_file) + try: + model_to_save.config.to_json_file(output_config_file) + except: + # args_dict = vars(args) + torch.save(config, os.path.join(output_dir, "train_args.pt")) + print("config can't be saved") + # tokenizer.save_vocabulary(output_dir) + tokenizer.save_pretrained(output_dir) # this will save all tokenizer files + + +def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0, sub_folder=""): + zero_stage_3 = zero_stage == 3 + output_dir = os.path.join(save_dir, sub_folder) + os.makedirs(output_dir, exist_ok=True) + WEIGHTS_NAME = "pytorch_model.bin" + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) + + model_to_save = model_ema.module if hasattr(model_ema, "module") else model_ema + if not zero_stage_3: + if global_rank == 0: + torch.save(model_to_save.state_dict(), output_model_file) + else: + output_state_dict = {} + for k, v in model_to_save.named_parameters(): + if hasattr(v, "ds_id"): + with deepspeed.zero.GatheredParameters( + _z3_params_to_fetch([v]), enabled=zero_stage_3 + ): + v_p = ( + v.data.clone().detach().cpu() + ) # this is a hack to get around the fact that we can't get the data from the param + else: + v_p = v.cpu() + if global_rank == 0 and "lora" not in k: + output_state_dict[k] = v_p + if global_rank == 0: + torch.save(output_state_dict, output_model_file) + del output_state_dict From 2ac2834fe6ba97de2abdf1ccdce6a9a7f3b772cd Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 23 Oct 2023 13:21:10 +0000 Subject: [PATCH 07/50] update [train_ds]: fix input, rm config element --- projects/opt/exp002_ds.yml | 11 +---------- train_ds.py | 4 +++- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index b7282ed..9da143b 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -4,16 +4,8 @@ training_config: gradient_accumulation_steps: 4 num_train_epochs: 1 dataloader_num_workers: 16 - fp16: true - optim: "adamw_torch" learning_rate: 5.0e-5 - logging_steps: 100 - evaluation_strategy: "steps" - save_strategy: "steps" - eval_steps: 4000 - save_steps: 4000 - save_total_limit: 1 - deepspeed: ./configs/deepspeed/ds_config_zero1.json + # logging_steps: 100 output_dir: ./output/ report_to: "wandb" zero_stage: 3 @@ -28,7 +20,6 @@ training_config: model_config: - fp16: true pretrained_path: # None or path to model weight model_type: git_llm language_model_name: facebook/opt-350m diff --git a/train_ds.py b/train_ds.py index 
c6fdc78..196eca4 100644 --- a/train_ds.py +++ b/train_ds.py @@ -93,12 +93,14 @@ def main(config_file: str, local_rank: int = 0): train_dataset, batch_size=training_config["per_device_train_batch_size"], sampler=DistributedSampler(train_dataset, shuffle=True, drop_last=True), + num_workers=training_config["dataloader_num_workers"], ) eval_dataloader = DataLoader( eval_dataset, batch_size=training_config["per_device_eval_batch_size"], sampler=DistributedSampler(eval_dataset, shuffle=False), + num_workers=training_config["dataloader_num_workers"], ) # Split weights in two groups, one with weight decay and the other not. @@ -164,7 +166,7 @@ def evaluation(model, eval_dataloader): loss = model( input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], - pixel_values=batch["pixel_values"], + pixel_values=batch["pixel_values"].half(), labels=batch["labels"], )[0] acc_loss += loss From ffdf56ab30e8e6b41fc8d693351c6fc4d425b2dc Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Wed, 25 Oct 2023 06:28:26 +0000 Subject: [PATCH 08/50] fix [train_ds.py]: saving model --- train_ds.py | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/train_ds.py b/train_ds.py index 196eca4..fde9f55 100644 --- a/train_ds.py +++ b/train_ds.py @@ -38,8 +38,6 @@ def main(config_file: str, local_rank: int = 0): - torch.cuda.empty_cache() - with open(config_file, "r") as i_: config = yaml.safe_load(i_) model_config = config["model_config"] @@ -64,7 +62,9 @@ def main(config_file: str, local_rank: int = 0): # DeepSpeedの初期化に必要な変数を設定 ds_config = get_train_ds_config( - training_config, offload=False, stage=training_config["zero_stage"] + training_config, + offload=training_config["cpu_offload"], + stage=training_config["zero_stage"], ) ds_config["train_micro_batch_size_per_gpu"] = training_config["per_device_train_batch_size"] ds_config["train_batch_size"] = ( @@ -84,9 +84,7 @@ def main(config_file: str, local_rank: int = 0): print_rank_0(model, training_config["global_rank"]) - config["dataset_config_path"] = [ - os.path.join("/home/yuma_ochi/heron-exp", path) for path in config["dataset_config_path"] - ] + # datasetの読み込み train_dataset, eval_dataset = get_dataset(config) train_dataloader = DataLoader( @@ -195,9 +193,9 @@ def evaluation(model, eval_dataloader): batch, device ) # torch.size(1, 3, 224, 224]) #torch.Size([1, 1, 3, 224, 224]) + # ここはDatasetの出力とモデルのforward関数を参考にした input_ids = batch["input_ids"] attention_mask = batch["attention_mask"] - # position_ids = batch["position_ids"] pixel_values = batch["pixel_values"].half() labels = batch["labels"] loss = model( @@ -209,6 +207,7 @@ def evaluation(model, eval_dataloader): acc_loss += loss.detach().clone() model.backward(loss) + # この中でgradient accumulationが行われることに注意 model.step() model.tput_timer.update_epoch_count() acc_loss = get_all_reduce_mean(acc_loss).item() @@ -220,20 +219,7 @@ def evaluation(model, eval_dataloader): if eval_loss < best_loss: best_loss = eval_loss - model = fuse_lora(model) - if training_config["global_rank"] == 0: - save_hf_format(model, tokenizer, training_config, f"epoch-{epoch}") - if training_config["zero_stage"] == 3: - # For zero stage 3, each gpu only has a part of the model, so we need a special save function - save_zero_three_model( - model, - training_config["global_rank"], - training_config["output_dir"], - zero_stage=training_config["zero_stage"], - sub_folder=f"epoch-{epoch}", - ) - model = unfuse_lora(model) - # save deepspeed zero checkpoint so we can resume training if 
needed + # 途中のチェックポイントの保存 client_state = { "random_rng_state": random.getstate(), "np_rng_state": np.random.get_state(), @@ -246,6 +232,22 @@ def evaluation(model, eval_dataloader): training_config["output_dir"], client_state=client_state ) # save to the latest + # モデルの保存(LoRAをモデルにマージしたもの) + if model_config["use_lora"]: + model = unload_and_merge_lora(model, model_config) + + if training_config["global_rank"] == 0: + save_hf_format(model, tokenizer, training_config, f"epoch-{epoch}") + if training_config["zero_stage"] == 3: + # For zero stage 3, each gpu only has a part of the model, so we need a special save function + save_zero_three_model( + model, + training_config["global_rank"], + training_config["output_dir"], + zero_stage=training_config["zero_stage"], + sub_folder=f"epoch-{epoch}", + ) + if __name__ == "__main__": fire.Fire(main) From dae2e3ba68b1c6529db6c5288c579181a630ae2a Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Wed, 25 Oct 2023 07:07:14 +0000 Subject: [PATCH 09/50] add wandb [train_ds.py] --- train_ds.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/train_ds.py b/train_ds.py index fde9f55..9b90fc3 100644 --- a/train_ds.py +++ b/train_ds.py @@ -17,6 +17,7 @@ from torch.utils.data.distributed import DistributedSampler from transformers import AdamW, AutoTokenizer, SchedulerType, get_scheduler +import wandb from heron.datasets.utils import get_dataset from heron.models.utils import ( apply_lora_model, @@ -72,6 +73,9 @@ def main(config_file: str, local_rank: int = 0): * torch.distributed.get_world_size() * training_config["gradient_accumulation_steps"] ) + # wandb の初期化 + if os.environ.get("WANDB_NAME") is not None and local_rank == 0: + wandb.init(project=os.environ["WANDB_PROJECT"]) # すべてのプロセスの処理が終わるまで待機 torch.distributed.barrier() @@ -209,6 +213,18 @@ def evaluation(model, eval_dataloader): model.backward(loss) # この中でgradient accumulationが行われることに注意 model.step() + + # wandbへのlog + if os.environ.get("WANDB_NAME") is not None and local_rank == 0: + wandb.log( + { + "Train/epoch": epoch, + "Train/step": step, + "Train/loss": loss.detach(), + "Train/average_loss": loss.detach() / step, + } + ) + model.tput_timer.update_epoch_count() acc_loss = get_all_reduce_mean(acc_loss).item() print_rank_0( @@ -219,6 +235,14 @@ def evaluation(model, eval_dataloader): if eval_loss < best_loss: best_loss = eval_loss + # wandbへのlog + if os.environ.get("WANDB_NAME") is not None and local_rank == 0: + wandb.log( + { + "Eval/loss": eval_loss, + } + ) + # 途中のチェックポイントの保存 client_state = { "random_rng_state": random.getstate(), From 8043bbf63a8351975090b5bd103649667a8d25db Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Wed, 25 Oct 2023 07:10:56 +0000 Subject: [PATCH 10/50] fix avg loss metric --- train_ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_ds.py b/train_ds.py index 9b90fc3..729935e 100644 --- a/train_ds.py +++ b/train_ds.py @@ -221,7 +221,7 @@ def evaluation(model, eval_dataloader): "Train/epoch": epoch, "Train/step": step, "Train/loss": loss.detach(), - "Train/average_loss": loss.detach() / step, + "Train/average_loss": acc_loss / step, } ) From 0050f85efdd8e02c8dd56854acd076b1ec36160f Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Wed, 25 Oct 2023 07:17:55 +0000 Subject: [PATCH 11/50] update config for train_ds.py[exp_002.yml] --- projects/opt/exp002_ds.yml | 1 + scripts/run_ds.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index 9da143b..00e0e3b 
100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -17,6 +17,7 @@ training_config: num_warmup_steps: 0. lr_scheduler_type: "cosine" gradient_checkpointing: False + cpu_offload: False model_config: diff --git a/scripts/run_ds.sh b/scripts/run_ds.sh index 63ddbd2..b01678a 100755 --- a/scripts/run_ds.sh +++ b/scripts/run_ds.sh @@ -1,6 +1,6 @@ #!/bin/bash export WANDB_PROJECT=heron -export PROJECT_NAME=opt/exp001 +export PROJECT_NAME=opt/exp002_ds export WANDB_NAME=$PROJECT_NAME deepspeed train_ds.py --config_file projects/$PROJECT_NAME.yml From 555a1c761483d976d42f120de9cea0a62ba79bf8 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sat, 28 Oct 2023 11:39:22 +0000 Subject: [PATCH 12/50] update [train_ds.py] --- heron/utils/utils.py | 6 +++--- train_ds.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/heron/utils/utils.py b/heron/utils/utils.py index bebe908..88a5b69 100644 --- a/heron/utils/utils.py +++ b/heron/utils/utils.py @@ -163,12 +163,12 @@ def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) -def save_hf_format(model, tokenizer, config, sub_folder=""): +def save_hf_format(model, config, sub_folder=""): # used to save huggingface format, so we can use it for hf.from_pretrained model_to_save = model.module if hasattr(model, "module") else model CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" - output_dir = os.path.join(args.output_dir, sub_folder) + output_dir = os.path.join(config["output_dir"], sub_folder) os.makedirs(output_dir, exist_ok=True) output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) @@ -184,7 +184,7 @@ def save_hf_format(model, tokenizer, config, sub_folder=""): torch.save(config, os.path.join(output_dir, "train_args.pt")) print("config can't be saved") # tokenizer.save_vocabulary(output_dir) - tokenizer.save_pretrained(output_dir) # this will save all tokenizer files + # tokenizer.save_pretrained(output_dir) # this will save all tokenizer files def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0, sub_folder=""): diff --git a/train_ds.py b/train_ds.py index 729935e..3c973f1 100644 --- a/train_ds.py +++ b/train_ds.py @@ -75,7 +75,7 @@ def main(config_file: str, local_rank: int = 0): ) # wandb の初期化 if os.environ.get("WANDB_NAME") is not None and local_rank == 0: - wandb.init(project=os.environ["WANDB_PROJECT"]) + wandb.init(project=os.environ["WANDB_PROJECT"], config=config) # すべてのプロセスの処理が終わるまで待機 torch.distributed.barrier() @@ -84,6 +84,10 @@ def main(config_file: str, local_rank: int = 0): model = load_model(model_config) if model_config["use_lora"]: + # VisualChatのLoRA実装 (w/o peft) + # model = convert_linear_layer_to_lora(model, ["query_key_value"], lora_dim=8) + + # HeronのLoRA実装 (w/ peft) model = apply_lora_model(model, model_config) print_rank_0(model, training_config["global_rank"]) @@ -160,7 +164,7 @@ def main(config_file: str, local_rank: int = 0): def evaluation(model, eval_dataloader): model.eval() - print("Evaluation") + print_rank_0("***** Evaluation *****", training_config["global_rank"]) acc_loss = 0 for step, batch in enumerate(eval_dataloader): with torch.no_grad(): @@ -193,9 +197,7 @@ def evaluation(model, eval_dataloader): model.train() acc_loss = 0 for step, batch in enumerate(train_dataloader): - batch = to_device( - batch, device - ) # torch.size(1, 3, 224, 224]) #torch.Size([1, 1, 3, 224, 224]) + batch = 
to_device(batch, device) # ここはDatasetの出力とモデルのforward関数を参考にした input_ids = batch["input_ids"] @@ -257,11 +259,11 @@ def evaluation(model, eval_dataloader): ) # save to the latest # モデルの保存(LoRAをモデルにマージしたもの) - if model_config["use_lora"]: - model = unload_and_merge_lora(model, model_config) + # if model_config["use_lora"]: + # model = unload_and_merge_lora(model, model_config) if training_config["global_rank"] == 0: - save_hf_format(model, tokenizer, training_config, f"epoch-{epoch}") + save_hf_format(model, training_config) if training_config["zero_stage"] == 3: # For zero stage 3, each gpu only has a part of the model, so we need a special save function save_zero_three_model( From 103737c2292b115bade7af30510c319a43454da1 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 30 Oct 2023 13:09:28 +0000 Subject: [PATCH 13/50] update train_ds.py --- train_ds.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/train_ds.py b/train_ds.py index 3c973f1..3e6fdaf 100644 --- a/train_ds.py +++ b/train_ds.py @@ -31,8 +31,6 @@ get_all_reduce_mean, get_optimizer_grouped_parameters, print_rank_0, - save_hf_format, - save_zero_three_model, set_random_seed, to_device, ) @@ -90,6 +88,16 @@ def main(config_file: str, local_rank: int = 0): # HeronのLoRA実装 (w/ peft) model = apply_lora_model(model, model_config) + # configの割り当て + keys_to_finetune = config["model_config"]["keys_to_finetune"] + keys_to_freeze = config["model_config"]["keys_to_freeze"] + # Set trainable params + trainable_list, untrainable_list = set_trainable_params( + model, keys_to_finetune, keys_to_freeze, train_lora=model_config["use_lora"] + ) + print("trainable_list", trainable_list) + print("untrainable_list", untrainable_list) + print_rank_0(model, training_config["global_rank"]) # datasetの読み込み @@ -259,20 +267,14 @@ def evaluation(model, eval_dataloader): ) # save to the latest # モデルの保存(LoRAをモデルにマージしたもの) - # if model_config["use_lora"]: - # model = unload_and_merge_lora(model, model_config) - - if training_config["global_rank"] == 0: - save_hf_format(model, training_config) - if training_config["zero_stage"] == 3: - # For zero stage 3, each gpu only has a part of the model, so we need a special save function - save_zero_three_model( - model, - training_config["global_rank"], - training_config["output_dir"], - zero_stage=training_config["zero_stage"], - sub_folder=f"epoch-{epoch}", - ) + if model_config["use_lora"]: + # model <- base_model <- module (DeepSpeedEngine) と2重にwrapされている + model_unlora = unload_and_merge_lora(model.module, model_config).base_model + else: + model_unlora = model + + save_path = os.path.join(training_config["output_dir"], f"epoch_{epoch}") + model_unlora.save_pretrained(save_path) if __name__ == "__main__": From 97edccbdf2ffb34acd4650e8a7da59f22e816616 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 31 Oct 2023 08:24:56 +0000 Subject: [PATCH 14/50] fix print -> print_rank_0[train_ds.py] --- train_ds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_ds.py b/train_ds.py index 3e6fdaf..14666c8 100644 --- a/train_ds.py +++ b/train_ds.py @@ -95,8 +95,8 @@ def main(config_file: str, local_rank: int = 0): trainable_list, untrainable_list = set_trainable_params( model, keys_to_finetune, keys_to_freeze, train_lora=model_config["use_lora"] ) - print("trainable_list", trainable_list) - print("untrainable_list", untrainable_list) + print_rank_0(f"trainable_list {trainable_list}", training_config["global_rank"]) + print_rank_0(f"untrainable_list 
{untrainable_list}", training_config["global_rank"]) print_rank_0(model, training_config["global_rank"]) From ceda701f050a2ebe630eaf83fee0cc9acef8b80d Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 31 Oct 2023 08:28:48 +0000 Subject: [PATCH 15/50] add simple coco (captioning) dataset --- configs/datasets/coco.yaml | 3 +++ projects/opt/exp002_ds.yml | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 configs/datasets/coco.yaml diff --git a/configs/datasets/coco.yaml b/configs/datasets/coco.yaml new file mode 100644 index 0000000..6ebc917 --- /dev/null +++ b/configs/datasets/coco.yaml @@ -0,0 +1,3 @@ +dataset_type: m3it +dataset_names: + - coco diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index 00e0e3b..73cded8 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -2,13 +2,13 @@ training_config: per_device_train_batch_size: 2 per_device_eval_batch_size: 2 gradient_accumulation_steps: 4 - num_train_epochs: 1 + num_train_epochs: 5 dataloader_num_workers: 16 learning_rate: 5.0e-5 # logging_steps: 100 output_dir: ./output/ report_to: "wandb" - zero_stage: 3 + zero_stage: 2 precision: "fp16" enable_tensorboard: False seed: 0 @@ -23,7 +23,7 @@ training_config: model_config: pretrained_path: # None or path to model weight model_type: git_llm - language_model_name: facebook/opt-350m + language_model_name: facebook/opt-125m vision_model_name: openai/clip-vit-base-patch16 num_image_with_embedding: 1 # if 1, no img_temporal_embedding max_length: 512 @@ -45,4 +45,4 @@ model_config: task_type: CAUSAL_LM dataset_config_path: - - ./configs/datasets/m3it.yaml + - ./configs/datasets/coco.yaml # cocoのみの訓練 From 102fbcae4f601cd328fbeef76cf3176fdec5701a Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 31 Oct 2023 12:52:29 +0000 Subject: [PATCH 16/50] add progressbar, fix model save point(rm merging LoRA while training) --- train_ds.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/train_ds.py b/train_ds.py index 14666c8..fab6fcc 100644 --- a/train_ds.py +++ b/train_ds.py @@ -15,6 +15,7 @@ import yaml from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm from transformers import AdamW, AutoTokenizer, SchedulerType, get_scheduler import wandb @@ -174,7 +175,8 @@ def evaluation(model, eval_dataloader): model.eval() print_rank_0("***** Evaluation *****", training_config["global_rank"]) acc_loss = 0 - for step, batch in enumerate(eval_dataloader): + progress_bar = tqdm(eval_dataloader, dynamic_ncols=True) + for step, batch in enumerate(progress_bar): with torch.no_grad(): batch = to_device(batch, device) loss = model( @@ -184,6 +186,8 @@ def evaluation(model, eval_dataloader): labels=batch["labels"], )[0] acc_loss += loss + text = f"step {step}, loss: {loss.detach():.5f} the average_loss: {acc_loss/step:.5f}" + progress_bar.set_description(text) model.train() acc_loss = get_all_reduce_mean(acc_loss).item() ave_loss = acc_loss / (step + 1) @@ -204,7 +208,8 @@ def evaluation(model, eval_dataloader): ) model.train() acc_loss = 0 - for step, batch in enumerate(train_dataloader): + progress_bar = tqdm(train_dataloader, dynamic_ncols=True) + for step, batch in enumerate(progress_bar): batch = to_device(batch, device) # ここはDatasetの出力とモデルのforward関数を参考にした @@ -235,6 +240,9 @@ def evaluation(model, eval_dataloader): } ) + text = f"step {step}, loss: {loss.detach():.5f} the average_loss: {acc_loss/step:.5f}" + 
progress_bar.set_description(text) + model.tput_timer.update_epoch_count() acc_loss = get_all_reduce_mean(acc_loss).item() print_rank_0( @@ -266,15 +274,19 @@ def evaluation(model, eval_dataloader): training_config["output_dir"], client_state=client_state ) # save to the latest - # モデルの保存(LoRAをモデルにマージしたもの) - if model_config["use_lora"]: - # model <- base_model <- module (DeepSpeedEngine) と2重にwrapされている - model_unlora = unload_and_merge_lora(model.module, model_config).base_model - else: - model_unlora = model - save_path = os.path.join(training_config["output_dir"], f"epoch_{epoch}") - model_unlora.save_pretrained(save_path) + model_to_save = ( + model.module.base_model if hasattr(model, "module.base_model") else model.module + ) + model_to_save.save_pretrained(save_path) + + if model_config["use_lora"]: + # model <- base_model <- module (DeepSpeedEngine) と2重にwrapされている + model = unload_and_merge_lora(model.module, model_config).base_model + else: + model = model.module + save_path = os.path.join(training_config["output_dir"], f"epoch_final") + model.save_pretrained(save_path) if __name__ == "__main__": From 3ac13c513c7a912f9e8ea9dddf6864004cac4782 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 31 Oct 2023 13:16:10 +0000 Subject: [PATCH 17/50] add beta to config --- projects/opt/exp002_ds.yml | 3 +++ train_ds.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index 73cded8..7c0c7f7 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -15,6 +15,9 @@ training_config: weight_decay: 0. learning_rate_pretraining_components: 0. num_warmup_steps: 0. + optim_betas: + - 0.9 + - 0.95 lr_scheduler_type: "cosine" gradient_checkpointing: False cpu_offload: False diff --git a/train_ds.py b/train_ds.py index fab6fcc..de4abe8 100644 --- a/train_ds.py +++ b/train_ds.py @@ -126,7 +126,9 @@ def main(config_file: str, local_rank: int = 0): ) optimizer = AdamW( - optimizer_grouped_parameters, lr=training_config["learning_rate"], betas=(0.9, 0.95) + optimizer_grouped_parameters, + lr=training_config["learning_rate"], + betas=tuple(training_config["optim_betas"]), ) num_update_steps_per_epoch = math.ceil( From 1c0a7ea80cb31b148f02566c64cdccaaecee6f38 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 31 Oct 2023 13:20:39 +0000 Subject: [PATCH 18/50] restore utils.py --- heron/utils/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/heron/utils/utils.py b/heron/utils/utils.py index 88a5b69..1e56816 100644 --- a/heron/utils/utils.py +++ b/heron/utils/utils.py @@ -163,12 +163,12 @@ def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) -def save_hf_format(model, config, sub_folder=""): +def save_hf_format(model, tokenizer, args, sub_folder=""): # used to save huggingface format, so we can use it for hf.from_pretrained model_to_save = model.module if hasattr(model, "module") else model CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" - output_dir = os.path.join(config["output_dir"], sub_folder) + output_dir = os.path.join(args.output_dir, sub_folder) os.makedirs(output_dir, exist_ok=True) output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) @@ -180,11 +180,11 @@ def save_hf_format(model, config, sub_folder=""): try: model_to_save.config.to_json_file(output_config_file) except: - # args_dict = vars(args) - 
torch.save(config, os.path.join(output_dir, "train_args.pt")) + args_dict = vars(args) + torch.save(args_dict, os.path.join(output_dir, "train_args.pt")) print("config can't be saved") # tokenizer.save_vocabulary(output_dir) - # tokenizer.save_pretrained(output_dir) # this will save all tokenizer files + tokenizer.save_pretrained(output_dir) # this will save all tokenizer files def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0, sub_folder=""): From 51695a2fb44ca0061fb08199f899503dac349772 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 31 Oct 2023 13:23:28 +0000 Subject: [PATCH 19/50] rm comment --- train_ds.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/train_ds.py b/train_ds.py index de4abe8..b54c220 100644 --- a/train_ds.py +++ b/train_ds.py @@ -83,9 +83,6 @@ def main(config_file: str, local_rank: int = 0): model = load_model(model_config) if model_config["use_lora"]: - # VisualChatのLoRA実装 (w/o peft) - # model = convert_linear_layer_to_lora(model, ["query_key_value"], lora_dim=8) - # HeronのLoRA実装 (w/ peft) model = apply_lora_model(model, model_config) From 0cb821eb2025b4e0a33a1c87ea7743bcf248b33d Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sun, 5 Nov 2023 05:29:32 +0000 Subject: [PATCH 20/50] Jp -> En [comment] --- projects/opt/exp002_ds.yml | 2 +- train_ds.py | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index 7c0c7f7..d3b830c 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -48,4 +48,4 @@ model_config: task_type: CAUSAL_LM dataset_config_path: - - ./configs/datasets/coco.yaml # cocoのみの訓練 + - ./configs/datasets/coco.yaml # only coco dataset diff --git a/train_ds.py b/train_ds.py index b54c220..0e3ae35 100644 --- a/train_ds.py +++ b/train_ds.py @@ -60,7 +60,7 @@ def main(config_file: str, local_rank: int = 0): set_random_seed(training_config["seed"]) - # DeepSpeedの初期化に必要な変数を設定 + # Get configs for initialize DeepSpeed ds_config = get_train_ds_config( training_config, offload=training_config["cpu_offload"], @@ -72,24 +72,22 @@ def main(config_file: str, local_rank: int = 0): * torch.distributed.get_world_size() * training_config["gradient_accumulation_steps"] ) - # wandb の初期化 + # Initialization of wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: wandb.init(project=os.environ["WANDB_PROJECT"], config=config) - # すべてのプロセスの処理が終わるまで待機 + # Wait for all processes torch.distributed.barrier() - # load model + # Load model model = load_model(model_config) if model_config["use_lora"]: - # HeronのLoRA実装 (w/ peft) model = apply_lora_model(model, model_config) - # configの割り当て + # Set trainable params keys_to_finetune = config["model_config"]["keys_to_finetune"] keys_to_freeze = config["model_config"]["keys_to_freeze"] - # Set trainable params trainable_list, untrainable_list = set_trainable_params( model, keys_to_finetune, keys_to_freeze, train_lora=model_config["use_lora"] ) @@ -98,7 +96,7 @@ def main(config_file: str, local_rank: int = 0): print_rank_0(model, training_config["global_rank"]) - # datasetの読み込み + # Load datasets train_dataset, eval_dataset = get_dataset(config) train_dataloader = DataLoader( @@ -211,7 +209,6 @@ def evaluation(model, eval_dataloader): for step, batch in enumerate(progress_bar): batch = to_device(batch, device) - # ここはDatasetの出力とモデルのforward関数を参考にした input_ids = batch["input_ids"] attention_mask = batch["attention_mask"] pixel_values = batch["pixel_values"].half() @@ -225,10 +222,10 @@ 
def evaluation(model, eval_dataloader): acc_loss += loss.detach().clone() model.backward(loss) - # この中でgradient accumulationが行われることに注意 + # Attention: gradient accumulation in the function model.step() - # wandbへのlog + # Log to wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: wandb.log( { @@ -252,7 +249,7 @@ def evaluation(model, eval_dataloader): if eval_loss < best_loss: best_loss = eval_loss - # wandbへのlog + # Log to wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: wandb.log( { @@ -260,7 +257,7 @@ def evaluation(model, eval_dataloader): } ) - # 途中のチェックポイントの保存 + # Save the checkpoint client_state = { "random_rng_state": random.getstate(), "np_rng_state": np.random.get_state(), @@ -280,7 +277,7 @@ def evaluation(model, eval_dataloader): model_to_save.save_pretrained(save_path) if model_config["use_lora"]: - # model <- base_model <- module (DeepSpeedEngine) と2重にwrapされている + # model is double-warapped: model <- base_model <- module (DeepSpeedEngine) model = unload_and_merge_lora(model.module, model_config).base_model else: model = model.module From 2aade141dde7cf0afac12f2a8a677dc70f78e175 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sun, 5 Nov 2023 07:01:05 +0000 Subject: [PATCH 21/50] Log lr to wandb --- train_ds.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train_ds.py b/train_ds.py index 0e3ae35..fe62211 100644 --- a/train_ds.py +++ b/train_ds.py @@ -227,12 +227,14 @@ def evaluation(model, eval_dataloader): # Log to wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: + now_lr = lr_scheduler.get_lr()[0] wandb.log( { "Train/epoch": epoch, "Train/step": step, "Train/loss": loss.detach(), "Train/average_loss": acc_loss / step, + "Train/learning_rate": now_lr, } ) From 02289e16ae63525fdf8e5f56cb81066072a48531 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sun, 5 Nov 2023 07:01:39 +0000 Subject: [PATCH 22/50] support full parameter tuning --- heron/models/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/heron/models/utils.py b/heron/models/utils.py index 3aa0a5c..85e489f 100644 --- a/heron/models/utils.py +++ b/heron/models/utils.py @@ -191,6 +191,9 @@ def set_trainable_params( untrainable_list.append(name) else: - raise ValueError("either keys_to_freeze or keys_to_finetune should be specified") + # Full parameter Tuning + for name, p in model.named_parameters(): + p.requires_grad = True + trainable_list.append(name) return trainable_list, untrainable_list From 9f02ae7f0b42998bded945be87f626aa9bf6a858 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sun, 5 Nov 2023 13:17:36 +0000 Subject: [PATCH 23/50] add license --- heron/utils/ds_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/heron/utils/ds_utils.py b/heron/utils/ds_utils.py index 0be0198..bc03460 100755 --- a/heron/utils/ds_utils.py +++ b/heron/utils/ds_utils.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 +# Modifications copyright 2023 Turing Inc. # DeepSpeed Team GLOBAL_BATCH_SIZE = 32 From 281d1e0a1bb90cfdc4131190b144af26af3bc6bf Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sun, 5 Nov 2023 13:19:17 +0000 Subject: [PATCH 24/50] change license --- train_ds.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/train_ds.py b/train_ds.py index fe62211..395202d 100644 --- a/train_ds.py +++ b/train_ds.py @@ -1,8 +1,17 @@ -#!/usr/bin/env python -# Copyright (c) Microsoft Corporation. 
-# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 Turing Inc. Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -# DeepSpeed Team import math import os import random @@ -155,8 +164,6 @@ def main(config_file: str, local_rank: int = 0): start_epoch = 0 # let load checkpoint if os.path.exists(os.path.join(training_config["output_dir"], "latest")): - # we have the deepspeed chekpoint so it is a resumed job - # TODO: after loading the ckpt, the global step is not loaded. Need to ask Tunji/Ammar for help. _, client_state = model.load_checkpoint(training_config["output_dir"]) start_epoch = client_state["epoch"] best_loss = client_state["best_loss"] From debe6bed4b755a1ea62851ed0e8836cd066844e4 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Sun, 5 Nov 2023 13:29:24 +0000 Subject: [PATCH 25/50] add DeepSpeedExamples to acknowledge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 97f26c7..7a037d3 100644 --- a/README.md +++ b/README.md @@ -225,3 +225,4 @@ Released under the [Apache License 2.0](./LICENSE). - [GenerativeImage2Text](https://github.com/microsoft/GenerativeImage2Text): The main idia of the model is based on original GIT. - [Llava](https://github.com/haotian-liu/LLaVA): This project is learned a lot from the great Llava project. 
- [GIT-LLM](https://github.com/Ino-Ichan/GIT-LLM) +- [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) From 4464c1055e9d7459ac48c4b5987c85b7f43ca1c7 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 6 Nov 2023 10:18:11 +0000 Subject: [PATCH 26/50] change: coco.yaml -> m3it_coco.yaml --- configs/datasets/{coco.yaml => m3it_coco.yaml} | 0 projects/opt/exp002_ds.yml | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) rename configs/datasets/{coco.yaml => m3it_coco.yaml} (100%) diff --git a/configs/datasets/coco.yaml b/configs/datasets/m3it_coco.yaml similarity index 100% rename from configs/datasets/coco.yaml rename to configs/datasets/m3it_coco.yaml diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index d3b830c..b2b9387 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -30,9 +30,9 @@ model_config: vision_model_name: openai/clip-vit-base-patch16 num_image_with_embedding: 1 # if 1, no img_temporal_embedding max_length: 512 - keys_to_finetune: - - visual_projection - - num_image_with_embedding + keys_to_finetune: [] + # - visual_projection + # - num_image_with_embedding keys_to_freeze: [] use_lora: true @@ -48,4 +48,4 @@ model_config: task_type: CAUSAL_LM dataset_config_path: - - ./configs/datasets/coco.yaml # only coco dataset + - ./configs/datasets/m3it_coco.yaml # only coco dataset From b675e82540a6eb148198148f013eec900a17b1c1 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 6 Nov 2023 10:22:50 +0000 Subject: [PATCH 27/50] add notice, copyright --- heron/utils/ds_utils.py | 9 +++++++++ train_ds.py | 27 ++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/heron/utils/ds_utils.py b/heron/utils/ds_utils.py index bc03460..ace5212 100755 --- a/heron/utils/ds_utils.py +++ b/heron/utils/ds_utils.py @@ -2,6 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 # Modifications copyright 2023 Turing Inc. +""" +NOTICE: This code is subject to the terms of the Apache License 2.0. + +The code is modified from the original one. +original code: https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-VisualChat/utils/ds_utils.py + +Additional contributions by Turing Inc. team +""" + # DeepSpeed Team GLOBAL_BATCH_SIZE = 32 MICRO_BATCH_SIZE = 4 diff --git a/train_ds.py b/train_ds.py index 395202d..eae026a 100644 --- a/train_ds.py +++ b/train_ds.py @@ -1,16 +1,14 @@ -# Copyright 2023 Turing Inc. Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +""" +NOTICE: This code is subject to the terms of the Apache License 2.0. + +The code is modified from the original one. +original code: https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-VisualChat/training/main.py + +Additional contributions by Turing Inc. 
team +""" import math import os @@ -81,6 +79,7 @@ def main(config_file: str, local_rank: int = 0): * torch.distributed.get_world_size() * training_config["gradient_accumulation_steps"] ) + # Initialization of wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: wandb.init(project=os.environ["WANDB_PROJECT"], config=config) @@ -146,6 +145,7 @@ def main(config_file: str, local_rank: int = 0): ) else: training_config["num_warmup_steps"] = int(training_config["num_warmup_steps"]) + lr_scheduler = get_scheduler( name=training_config["lr_scheduler_type"], optimizer=optimizer, @@ -290,6 +290,7 @@ def evaluation(model, eval_dataloader): model = unload_and_merge_lora(model.module, model_config).base_model else: model = model.module + save_path = os.path.join(training_config["output_dir"], f"epoch_final") model.save_pretrained(save_path) From 45ef591eb7b82a0b4fd3f7655af2974d7001c442 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 6 Nov 2023 10:53:57 +0000 Subject: [PATCH 28/50] add todo (merge LoRA) --- train_ds.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/train_ds.py b/train_ds.py index eae026a..efdf70f 100644 --- a/train_ds.py +++ b/train_ds.py @@ -285,14 +285,16 @@ def evaluation(model, eval_dataloader): ) model_to_save.save_pretrained(save_path) - if model_config["use_lora"]: - # model is double-warapped: model <- base_model <- module (DeepSpeedEngine) - model = unload_and_merge_lora(model.module, model_config).base_model - else: - model = model.module - - save_path = os.path.join(training_config["output_dir"], f"epoch_final") - model.save_pretrained(save_path) + # TODO: support merging LoRA for ZeRO3 training + if training_config["zero_stage"] != 3: + if model_config["use_lora"]: + # model is double-warapped: model <- base_model <- module (DeepSpeedEngine) + model = unload_and_merge_lora(model.module, model_config).base_model + else: + model = model.module + + save_path = os.path.join(training_config["output_dir"], f"epoch_final") + model.save_pretrained(save_path) if __name__ == "__main__": From d9d4635ac4391a5f105f106660027f9fb8f7fb87 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 6 Nov 2023 10:58:23 +0000 Subject: [PATCH 29/50] add uses [README] --- README.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/README.md b/README.md index 7a037d3..48d9b2a 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,80 @@ To start learning, execute the following command. GPU is required for learning; we have tested on Ubuntu 20.04, CUDA 11.7. +# Training (w/o Trainer) + +We offer `train_ds.py`, a training script independent of Hugging Face's `Trainer` class for more flexible learning configurations. +For example, the contents of [projects/opt/exp002_ds.yml](. projects/opt/exp002_ds.yml) has the following contents: + +```yaml +training_config: + per_device_train_batch_size: 2 + per_device_eval_batch_size: 2 + gradient_accumulation_steps: 4 + num_train_epochs: 5 + dataloader_num_workers: 16 + learning_rate: 5.0e-5 + # logging_steps: 100 + output_dir: ./output/ + report_to: "wandb" + zero_stage: 2 + precision: "fp16" + enable_tensorboard: False + seed: 0 + weight_decay: 0. + learning_rate_pretraining_components: 0. + num_warmup_steps: 0. 
+ optim_betas: + - 0.9 + - 0.95 + lr_scheduler_type: "cosine" + gradient_checkpointing: False + cpu_offload: False + + +model_config: + pretrained_path: # None or path to model weight + model_type: git_llm + language_model_name: facebook/opt-125m + vision_model_name: openai/clip-vit-base-patch16 + num_image_with_embedding: 1 # if 1, no img_temporal_embedding + max_length: 512 + keys_to_finetune: + - visual_projection + - num_image_with_embedding + keys_to_freeze: [] + + use_lora: true + lora: + r: 8 + lora_alpha: 32 + target_modules: + - q_proj + - k_proj + - v_proj + lora_dropout: 0.01 + bias: none + task_type: CAUSAL_LM + +dataset_config_path: + - ./configs/datasets/m3it_coco.yaml # only coco dataset +``` + +To start learning, execute the following command. + +```bash +./scripts/run_ds.sh +``` + +## [ZeRO-3] Conversion + +After training the model with ZeRO-3, you can obtain a model for inference by generating `pytorch_model.bin` from the checkpoint with the following steps: + +```bash +cd output/opt/exp002_ds # Move to the output dir of your experiment. +python zero_to_fp32.py . pytorch_model.bin # Generating the PyTorch model in the current directory. +``` + # Evaluation You can get the pretrained weight form Hugging Face Hub: [turing-motors/heron-chat-git-ja-stablelm-base-7b-v0](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v0)
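The `## [ZeRO-3] Conversion` section added in the patch above drives the conversion through the `zero_to_fp32.py` script that DeepSpeed drops into the checkpoint directory. For reference, the same consolidation can also be done from Python with DeepSpeed's helper; the sketch below is a minimal illustration, assuming DeepSpeed is installed and using a hypothetical `./output/opt/exp002_ds` checkpoint path.

```python
# Minimal sketch: consolidate a ZeRO checkpoint into a single fp32 state dict
# without running zero_to_fp32.py by hand.
# Assumption: checkpoint_dir matches the output_dir used during training.
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

checkpoint_dir = "./output/opt/exp002_ds"  # hypothetical path; adjust to your run
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)  # gathers the ZeRO shards on CPU
torch.save(state_dict, f"{checkpoint_dir}/pytorch_model.bin")  # same artifact the script writes
```

Either route should produce the `pytorch_model.bin` that the evaluation instructions load.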
From f1acdba20c4f0e227ea9ffe675174cca20d2cb9b Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 6 Nov 2023 11:00:33 +0000 Subject: [PATCH 30/50] fix path --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 48d9b2a..0aee49b 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ GPU is required for learning; we have tested on Ubuntu 20.04, CUDA 11.7. # Training (w/o Trainer) We offer `train_ds.py`, a training script independent of Hugging Face's `Trainer` class for more flexible learning configurations. -For example, the contents of [projects/opt/exp002_ds.yml](. projects/opt/exp002_ds.yml) has the following contents: +For example, the contents of [projects/opt/exp002_ds.yml](projects/opt/exp002_ds.yml) has the following contents: ```yaml training_config: @@ -156,7 +156,6 @@ training_config: num_train_epochs: 5 dataloader_num_workers: 16 learning_rate: 5.0e-5 - # logging_steps: 100 output_dir: ./output/ report_to: "wandb" zero_stage: 2 From 73a02b26c75045c55b0f0701b01f004a3a526cf0 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 7 Nov 2023 13:22:54 +0000 Subject: [PATCH 31/50] fix calc loss stepwise --- train_ds.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/train_ds.py b/train_ds.py index efdf70f..11152ab 100644 --- a/train_ds.py +++ b/train_ds.py @@ -189,11 +189,11 @@ def evaluation(model, eval_dataloader): pixel_values=batch["pixel_values"].half(), labels=batch["labels"], )[0] - acc_loss += loss - text = f"step {step}, loss: {loss.detach():.5f} the average_loss: {acc_loss/step:.5f}" + loss_log = get_all_reduce_mean(loss.detach().clone()).item() + acc_loss += loss_log + text = f"step {step}, loss: {loss_log:.5f} the average_loss: {acc_loss/(step+1):.5f}" progress_bar.set_description(text) model.train() - acc_loss = get_all_reduce_mean(acc_loss).item() ave_loss = acc_loss / (step + 1) print_rank_0(f"the eval average_loss: {ave_loss}", training_config["global_rank"]) return ave_loss @@ -227,7 +227,8 @@ def evaluation(model, eval_dataloader): labels=labels, )[0] - acc_loss += loss.detach().clone() + loss_log = get_all_reduce_mean(loss.detach().clone()).item() + acc_loss += loss_log model.backward(loss) # Attention: gradient accumulation in the function model.step() @@ -239,17 +240,16 @@ def evaluation(model, eval_dataloader): { "Train/epoch": epoch, "Train/step": step, - "Train/loss": loss.detach(), - "Train/average_loss": acc_loss / step, + "Train/loss": loss_log, + "Train/average_loss": acc_loss / (step + 1), "Train/learning_rate": now_lr, } ) - text = f"step {step}, loss: {loss.detach():.5f} the average_loss: {acc_loss/step:.5f}" + text = f"step {step}, loss: {loss.detach():.5f} the average_loss: {acc_loss/(step + 1):.5f}" progress_bar.set_description(text) model.tput_timer.update_epoch_count() - acc_loss = get_all_reduce_mean(acc_loss).item() print_rank_0( f"Epoch {epoch+1}, the average_loss: {acc_loss/step}", training_config["global_rank"] ) From 45563b3b7c95a470f458cd1d0747a78c85c53ffc Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 7 Nov 2023 13:25:28 +0000 Subject: [PATCH 32/50] fix [exp002_ds.yml] --- projects/opt/exp002_ds.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index b2b9387..59b94a3 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -5,7 +5,6 @@ training_config: num_train_epochs: 5 dataloader_num_workers: 16 learning_rate: 5.0e-5 - # logging_steps: 100 
output_dir: ./output/ report_to: "wandb" zero_stage: 2 @@ -30,9 +29,9 @@ model_config: vision_model_name: openai/clip-vit-base-patch16 num_image_with_embedding: 1 # if 1, no img_temporal_embedding max_length: 512 - keys_to_finetune: [] - # - visual_projection - # - num_image_with_embedding + keys_to_finetune: + - visual_projection + - num_image_with_embedding keys_to_freeze: [] use_lora: true From 50110a837da797e059037711e57ec5312415b45a Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 7 Nov 2023 13:27:45 +0000 Subject: [PATCH 33/50] chore typo --- train_ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_ds.py b/train_ds.py index 11152ab..fbc9507 100644 --- a/train_ds.py +++ b/train_ds.py @@ -285,7 +285,7 @@ def evaluation(model, eval_dataloader): ) model_to_save.save_pretrained(save_path) - # TODO: support merging LoRA for ZeRO3 training + # TODO: support merging LoRA for ZeRO-3 training if training_config["zero_stage"] != 3: if model_config["use_lora"]: # model is double-warapped: model <- base_model <- module (DeepSpeedEngine) From d06d5e4ddcb799b85b74c9d4d9c7c194065ebe93 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 7 Nov 2023 13:32:19 +0000 Subject: [PATCH 34/50] rm redundent saving model --- train_ds.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/train_ds.py b/train_ds.py index fbc9507..67e9d01 100644 --- a/train_ds.py +++ b/train_ds.py @@ -279,12 +279,6 @@ def evaluation(model, eval_dataloader): training_config["output_dir"], client_state=client_state ) # save to the latest - save_path = os.path.join(training_config["output_dir"], f"epoch_{epoch}") - model_to_save = ( - model.module.base_model if hasattr(model, "module.base_model") else model.module - ) - model_to_save.save_pretrained(save_path) - # TODO: support merging LoRA for ZeRO-3 training if training_config["zero_stage"] != 3: if model_config["use_lora"]: From 1c31aee13cdc85c877e45d77f9f777847e9644b6 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Tue, 7 Nov 2023 13:46:40 +0000 Subject: [PATCH 35/50] adapt the saving model structure to original --- train_ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_ds.py b/train_ds.py index 67e9d01..1cd44dd 100644 --- a/train_ds.py +++ b/train_ds.py @@ -283,7 +283,7 @@ def evaluation(model, eval_dataloader): if training_config["zero_stage"] != 3: if model_config["use_lora"]: # model is double-warapped: model <- base_model <- module (DeepSpeedEngine) - model = unload_and_merge_lora(model.module, model_config).base_model + model = unload_and_merge_lora(model.module, model_config) else: model = model.module From d261c8c4d7dfcbb6170a96796322525787682b41 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 13 Nov 2023 11:34:13 +0000 Subject: [PATCH 36/50] Fix: all reduce logic --- train_ds.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/train_ds.py b/train_ds.py index 1cd44dd..68883f5 100644 --- a/train_ds.py +++ b/train_ds.py @@ -189,9 +189,9 @@ def evaluation(model, eval_dataloader): pixel_values=batch["pixel_values"].half(), labels=batch["labels"], )[0] - loss_log = get_all_reduce_mean(loss.detach().clone()).item() - acc_loss += loss_log - text = f"step {step}, loss: {loss_log:.5f} the average_loss: {acc_loss/(step+1):.5f}" + acc_loss += loss.float() + text = f"step {step}, loss: {loss:.5f} the average_loss: {acc_loss.item()/(step+1)=}" + # print_rank_0(text) progress_bar.set_description(text) model.train() ave_loss = acc_loss / (step + 1) @@ -227,8 +227,7 @@ def evaluation(model, 
eval_dataloader): labels=labels, )[0] - loss_log = get_all_reduce_mean(loss.detach().clone()).item() - acc_loss += loss_log + acc_loss += loss.float() model.backward(loss) # Attention: gradient accumulation in the function model.step() @@ -240,7 +239,7 @@ def evaluation(model, eval_dataloader): { "Train/epoch": epoch, "Train/step": step, - "Train/loss": loss_log, + "Train/loss": loss, "Train/average_loss": acc_loss / (step + 1), "Train/learning_rate": now_lr, } From 1354b33f196c13ffbe997cacee129f89d7870854 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 13 Nov 2023 11:52:15 +0000 Subject: [PATCH 37/50] add notice (applied format) --- heron/utils/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/heron/utils/utils.py b/heron/utils/utils.py index 1e56816..38a7b9a 100644 --- a/heron/utils/utils.py +++ b/heron/utils/utils.py @@ -1,6 +1,15 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 +""" +NOTICE: This code is subject to the terms of the Apache License 2.0. + +The code is modified from the original one. +original code: https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-VisualChat/utils/ds_utils.py + +Additional contributions by Turing Inc. team +""" + import json # DeepSpeed Team From 77a7fe077086f26a6f3bd5f32c8ce2445c79a520 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Fri, 17 Nov 2023 12:32:33 +0000 Subject: [PATCH 38/50] rm lora --- train_ds.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/train_ds.py b/train_ds.py index 68883f5..d8fb7f8 100644 --- a/train_ds.py +++ b/train_ds.py @@ -90,14 +90,11 @@ def main(config_file: str, local_rank: int = 0): # Load model model = load_model(model_config) - if model_config["use_lora"]: - model = apply_lora_model(model, model_config) - # Set trainable params keys_to_finetune = config["model_config"]["keys_to_finetune"] keys_to_freeze = config["model_config"]["keys_to_freeze"] trainable_list, untrainable_list = set_trainable_params( - model, keys_to_finetune, keys_to_freeze, train_lora=model_config["use_lora"] + model, keys_to_finetune, keys_to_freeze, train_lora=False ) print_rank_0(f"trainable_list {trainable_list}", training_config["global_rank"]) print_rank_0(f"untrainable_list {untrainable_list}", training_config["global_rank"]) @@ -280,11 +277,7 @@ def evaluation(model, eval_dataloader): # TODO: support merging LoRA for ZeRO-3 training if training_config["zero_stage"] != 3: - if model_config["use_lora"]: - # model is double-warapped: model <- base_model <- module (DeepSpeedEngine) - model = unload_and_merge_lora(model.module, model_config) - else: - model = model.module + model = model.module save_path = os.path.join(training_config["output_dir"], f"epoch_final") model.save_pretrained(save_path) From b4b59ea8bf8516ea6ac1f30b17e6a6a13164ad57 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 10:47:02 +0000 Subject: [PATCH 39/50] add ZeRO-3 instruction --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 0aee49b..f8013a8 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,7 @@ python zero_to_fp32.py . pytorch_model.bin # Generating the PyTorch model in th # Evaluation +If you have the model trained by ZeRO-3 You can get the pretrained weight form Hugging Face Hub: [turing-motors/heron-chat-git-ja-stablelm-base-7b-v0](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v0)
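The `Fix: all reduce logic` patch above switches the loops to accumulate `loss.float()` locally instead of calling `get_all_reduce_mean` on every step. For context, a helper with that name conventionally averages a scalar tensor across ranks, roughly as in the sketch below; this is an illustration of the convention under the assumption that the process group is already initialized, not code copied from `heron/utils/utils.py`.

```python
# Illustration: average a scalar tensor across all ranks.
# Assumes torch.distributed is already initialized (e.g. via deepspeed.initialize).
import torch
import torch.distributed as dist

def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)  # in-place sum across ranks
    return tensor / dist.get_world_size()          # turn the sum into a mean
```

With the per-step all-reduce removed, the running `average_loss` that gets logged is computed from each rank's local losses.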
See also [notebooks](./notebooks). @@ -270,6 +271,26 @@ with torch.no_grad(): print(processor.tokenizer.batch_decode(out)[0]) ``` +If you have a model trained using ZeRO-3, it must be modified as follows: + +```diff +- # prepare a pretrained model +- model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +- ) ++ from heron.models.utils import load_model, load_pretrained_weight ++ import yaml ++ ++ config_file = f"./projects/opt/exp002_ds.yml" ++ ++ # get config ++ with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++ model = load_model(config["model_config"]) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +``` + ### Pretrained Models |model|LLM module|adapter|size| From 9342e04f840bd0c2ebf54067239b3cc32ece3d63 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 10:56:40 +0000 Subject: [PATCH 40/50] add JP docs --- docs/README_JP.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/docs/README_JP.md b/docs/README_JP.md index 437f1d4..1ec0f38 100644 --- a/docs/README_JP.md +++ b/docs/README_JP.md @@ -142,6 +142,78 @@ dataset_config_path: 学習にはGPUが必要です。Ubuntu20.04, CUDA11.7で動作確認をしています。 +# 学習方法 (Trainerなし) +Hugging Faceの `Trainer` クラスに依存しない訓練スクリプト `train_ds.py` を提供しています。
+例えば、[projects/opt/exp002_ds.yml](../projects/opt/exp_002_ds.yml)の内容は次のようになっています。 + +```yaml +training_config: + per_device_train_batch_size: 2 + per_device_eval_batch_size: 2 + gradient_accumulation_steps: 4 + num_train_epochs: 5 + dataloader_num_workers: 16 + learning_rate: 5.0e-5 + output_dir: ./output/ + report_to: "wandb" + zero_stage: 2 + precision: "fp16" + enable_tensorboard: False + seed: 0 + weight_decay: 0. + learning_rate_pretraining_components: 0. + num_warmup_steps: 0. + optim_betas: + - 0.9 + - 0.95 + lr_scheduler_type: "cosine" + gradient_checkpointing: False + cpu_offload: False + + +model_config: + pretrained_path: # None or path to model weight + model_type: git_llm + language_model_name: facebook/opt-125m + vision_model_name: openai/clip-vit-base-patch16 + num_image_with_embedding: 1 # if 1, no img_temporal_embedding + max_length: 512 + keys_to_finetune: + - visual_projection + - num_image_with_embedding + keys_to_freeze: [] + + use_lora: true + lora: + r: 8 + lora_alpha: 32 + target_modules: + - q_proj + - k_proj + - v_proj + lora_dropout: 0.01 + bias: none + task_type: CAUSAL_LM + +dataset_config_path: + - ./configs/datasets/m3it_coco.yaml # only coco dataset +``` + +学習を開始する場合は、次のコマンドを実行してください。 + +```bash +./scripts/run_ds.sh +``` + +## [ZeRO-3] モデルの変換 + +ZeRO-3で訓練した後、推論に使う `pytorch_model.bin` を生成するには次の手順を実行します。 + +```bash +cd output/opt/exp002_ds # 実験のアウトプットディレクトリに移動します +python zero_to_fp32.py . pytorch_model.bin # PyTorchモデルを生成します +``` + # 利用方法 Hugging Face Hubから学習済みモデルをダウンロードすることができます: [turing-motors/heron-chat-git-ja-stablelm-base-7b-v0](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v0)
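The YAML shown in this new Japanese section mirrors the English README: `train_ds.py` reads the file with `yaml.safe_load` and then picks out the `training_config`, `model_config`, and `dataset_config_path` sections by key. A quick way to sanity-check a project file before launching is sketched below; it assumes you run it from the repository root so the relative path resolves.

```python
# Load a project config the same way train_ds.py does and print a few fields.
import yaml

with open("projects/opt/exp002_ds.yml", "r") as f:
    config = yaml.safe_load(f)

training_config = config["training_config"]
model_config = config["model_config"]

print(training_config["zero_stage"])        # ZeRO stage used by DeepSpeed
print(model_config["language_model_name"])  # e.g. facebook/opt-125m
print(config["dataset_config_path"])        # list of dataset YAMLs
```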
@@ -194,6 +266,26 @@ with torch.no_grad(): print(processor.tokenizer.batch_decode(out)) ``` +もしZeRO-3で訓練されたモデルならば、推論用コードに次の変更を加えます。 + +```diff +- # prepare a pretrained model +- model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +- ) ++ from heron.models.utils import load_model, load_pretrained_weight ++ import yaml ++ ++ config_file = f"./projects/opt/exp002_ds.yml" ++ ++ # get config ++ with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++ model = load_model(config["model_config"]) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +``` + ### 学習済みモデル一覧 |model|LLM module|adapter|size| @@ -221,3 +313,4 @@ print(processor.tokenizer.batch_decode(out)) - [GenerativeImage2Text](https://github.com/microsoft/GenerativeImage2Text): モデルの構成方法の着想はGITに基づいています。 - [Llava](https://github.com/haotian-liu/LLaVA): 本ライブラリはLlavaプロジェクトを参考にしています。 - [GIT-LLM](https://github.com/Ino-Ichan/GIT-LLM) +- [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) From 3ec603703971d67c093799d51449619cbd9b9458 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 11:03:04 +0000 Subject: [PATCH 41/50] add chinese docs --- docs/README_CN.md | 92 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/docs/README_CN.md b/docs/README_CN.md index 8f710d5..d69d2aa 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -143,6 +143,77 @@ training_config "为训练设置, "model_config "为模型设置,"dataset_conf 学习需要 GPU;我们在 Ubuntu 20.04 和 CUDA 11.7 上对系统进行了测试. +# 学习方法 (不含 Trainer) +我们提供 `train_ds.py` ,一个独立于Hugging Face训练师类的训练脚本,用于更灵活的学习配置。例如,[projects/opt/exp002_ds.yml](../projects/opt/exp002_ds.yml) 的内容如下: + +```yaml +training_config: + per_device_train_batch_size: 2 + per_device_eval_batch_size: 2 + gradient_accumulation_steps: 4 + num_train_epochs: 5 + dataloader_num_workers: 16 + learning_rate: 5.0e-5 + output_dir: ./output/ + report_to: "wandb" + zero_stage: 2 + precision: "fp16" + enable_tensorboard: False + seed: 0 + weight_decay: 0. + learning_rate_pretraining_components: 0. + num_warmup_steps: 0. + optim_betas: + - 0.9 + - 0.95 + lr_scheduler_type: "cosine" + gradient_checkpointing: False + cpu_offload: False + + +model_config: + pretrained_path: # None or path to model weight + model_type: git_llm + language_model_name: facebook/opt-125m + vision_model_name: openai/clip-vit-base-patch16 + num_image_with_embedding: 1 # if 1, no img_temporal_embedding + max_length: 512 + keys_to_finetune: + - visual_projection + - num_image_with_embedding + keys_to_freeze: [] + + use_lora: true + lora: + r: 8 + lora_alpha: 32 + target_modules: + - q_proj + - k_proj + - v_proj + lora_dropout: 0.01 + bias: none + task_type: CAUSAL_LM + +dataset_config_path: + - ./configs/datasets/m3it_coco.yaml # only coco dataset +``` + +要开始学习, 请执行以下命令. + + +```bash +./scripts/run_ds.sh +``` + +## [ZeRO-3] 转化 + +使用ZeRO-3训练模型后,可以通过以下步骤从检查点生成 `pytorch_model.bin` ,从而获得用于推理的模型: + +```bash +cd output/opt/exp002_ds # 実験のアウトプットディレクトリに移動します +python zero_to_fp32.py . pytorch_model.bin # PyTorchモデルを生成します +``` # 如何使用 您可以从 Hugging Face Hub 下载训练好的模型:[turing-motors/heron-chat-git-ja-stablelm-base-7b-v0](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v0)
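One detail worth spelling out about the `training_config` above: `train_ds.py` fills in `ds_config["train_batch_size"]` as `per_device_train_batch_size * world_size * gradient_accumulation_steps`. The worked example below uses the values from this config; the eight-GPU world size is an assumption for illustration, since the real value comes from the `deepspeed` launcher.

```python
# Effective (global) batch size implied by the config above.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
world_size = 8  # assumed GPU count; supplied by torch.distributed at runtime

train_batch_size = per_device_train_batch_size * world_size * gradient_accumulation_steps
print(train_batch_size)  # -> 64 under these assumptions
```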
@@ -195,6 +266,26 @@ with torch.no_grad(): print(processor.tokenizer.batch_decode(out)) ``` +如果模型是用 ZeRO-3 训练的,请进行以下更改. + +```diff +- # prepare a pretrained model +- model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +- ) ++ from heron.models.utils import load_model, load_pretrained_weight ++ import yaml ++ ++ config_file = f"./projects/opt/exp002_ds.yml" ++ ++ # get config ++ with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++ model = load_model(config["model_config"]) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +``` + ### 训练有素的模型列表 |model|LLM module|adapter|size| @@ -222,3 +313,4 @@ print(processor.tokenizer.batch_decode(out)) - [GenerativeImage2Text](https://github.com/microsoft/GenerativeImage2Text) - [Llava](https://github.com/haotian-liu/LLaVA) - [GIT-LLM](https://github.com/Ino-Ichan/GIT-LLM) +- [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) From 92a422e6a7378b7aefa4c7de631d646f01958c91 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 11:08:40 +0000 Subject: [PATCH 42/50] fix diff --- README.md | 30 +++++++++++++++--------------- docs/README_CN.md | 30 +++++++++++++++--------------- docs/README_JP.md | 30 +++++++++++++++--------------- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index f8013a8..9016eef 100644 --- a/README.md +++ b/README.md @@ -274,21 +274,21 @@ print(processor.tokenizer.batch_decode(out)[0]) If you have a model trained using ZeRO-3, it must be modified as follows: ```diff -- # prepare a pretrained model -- model = GitLlamaForCausalLM.from_pretrained( -- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 -- ) -+ from heron.models.utils import load_model, load_pretrained_weight -+ import yaml -+ -+ config_file = f"./projects/opt/exp002_ds.yml" -+ -+ # get config -+ with open(config_file, "r") as i_: -+ config = yaml.safe_load(i_) -+ -+ model = load_model(config["model_config"]) -+ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +-# prepare a pretrained model +-model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +-) ++from heron.models.utils import load_model, load_pretrained_weight ++import yaml ++ ++config_file = f"./projects/opt/exp002_ds.yml" ++ ++# get config ++with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++model = load_model(config["model_config"]) ++model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ``` ### Pretrained Models diff --git a/docs/README_CN.md b/docs/README_CN.md index d69d2aa..b532bbb 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -269,21 +269,21 @@ print(processor.tokenizer.batch_decode(out)) 如果模型是用 ZeRO-3 训练的,请进行以下更改. 
```diff -- # prepare a pretrained model -- model = GitLlamaForCausalLM.from_pretrained( -- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 -- ) -+ from heron.models.utils import load_model, load_pretrained_weight -+ import yaml -+ -+ config_file = f"./projects/opt/exp002_ds.yml" -+ -+ # get config -+ with open(config_file, "r") as i_: -+ config = yaml.safe_load(i_) -+ -+ model = load_model(config["model_config"]) -+ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +-# prepare a pretrained model +-model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +-) ++from heron.models.utils import load_model, load_pretrained_weight ++import yaml ++ ++config_file = f"./projects/opt/exp002_ds.yml" ++ ++# get config ++with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++model = load_model(config["model_config"]) ++model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ``` ### 训练有素的模型列表 diff --git a/docs/README_JP.md b/docs/README_JP.md index 1ec0f38..2be8760 100644 --- a/docs/README_JP.md +++ b/docs/README_JP.md @@ -269,21 +269,21 @@ print(processor.tokenizer.batch_decode(out)) もしZeRO-3で訓練されたモデルならば、推論用コードに次の変更を加えます。 ```diff -- # prepare a pretrained model -- model = GitLlamaForCausalLM.from_pretrained( -- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 -- ) -+ from heron.models.utils import load_model, load_pretrained_weight -+ import yaml -+ -+ config_file = f"./projects/opt/exp002_ds.yml" -+ -+ # get config -+ with open(config_file, "r") as i_: -+ config = yaml.safe_load(i_) -+ -+ model = load_model(config["model_config"]) -+ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +-# prepare a pretrained model +-model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +-) ++from heron.models.utils import load_model, load_pretrained_weight ++import yaml ++ ++config_file = f"./projects/opt/exp002_ds.yml" ++ ++# get config ++with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++model = load_model(config["model_config"]) ++model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ``` ### 学習済みモデル一覧 From d7b0f689e9017c44a9e13cdf584c328d26e7ac3d Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 11:09:45 +0000 Subject: [PATCH 43/50] Revert "fix diff" This reverts commit 92a422e6a7378b7aefa4c7de631d646f01958c91. 
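For reference alongside these README revisions: the `pytorch_model.bin` they describe loading is converted from DeepSpeed checkpoints that `train_ds.py` writes with `save_checkpoint`, carrying a small `client_state` dict that `load_checkpoint` reads back when resuming. The sketch below condenses that pattern; it is not a verbatim excerpt, `engine` stands in for the DeepSpeed engine returned by `deepspeed.initialize`, and only the `client_state` keys visible in these patches are shown.

```python
# Condensed sketch of the checkpoint save/resume pattern used by train_ds.py.
import os
import random

import numpy as np

def save_resume_state(engine, output_dir, epoch, best_loss):
    client_state = {
        "random_rng_state": random.getstate(),
        "np_rng_state": np.random.get_state(),
        "epoch": epoch,
        "best_loss": best_loss,
    }
    engine.save_checkpoint(output_dir, client_state=client_state)  # also refreshes the "latest" tag

def maybe_resume(engine, output_dir):
    # DeepSpeed writes a "latest" file next to its checkpoints when saving.
    if os.path.exists(os.path.join(output_dir, "latest")):
        _, client_state = engine.load_checkpoint(output_dir)
        return client_state["epoch"], client_state["best_loss"]
    return 0, float("inf")
```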
--- README.md | 30 +++++++++++++++--------------- docs/README_CN.md | 30 +++++++++++++++--------------- docs/README_JP.md | 30 +++++++++++++++--------------- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 9016eef..f8013a8 100644 --- a/README.md +++ b/README.md @@ -274,21 +274,21 @@ print(processor.tokenizer.batch_decode(out)[0]) If you have a model trained using ZeRO-3, it must be modified as follows: ```diff --# prepare a pretrained model --model = GitLlamaForCausalLM.from_pretrained( -- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 --) -+from heron.models.utils import load_model, load_pretrained_weight -+import yaml -+ -+config_file = f"./projects/opt/exp002_ds.yml" -+ -+# get config -+with open(config_file, "r") as i_: -+ config = yaml.safe_load(i_) -+ -+model = load_model(config["model_config"]) -+model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +- # prepare a pretrained model +- model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +- ) ++ from heron.models.utils import load_model, load_pretrained_weight ++ import yaml ++ ++ config_file = f"./projects/opt/exp002_ds.yml" ++ ++ # get config ++ with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++ model = load_model(config["model_config"]) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ``` ### Pretrained Models diff --git a/docs/README_CN.md b/docs/README_CN.md index b532bbb..d69d2aa 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -269,21 +269,21 @@ print(processor.tokenizer.batch_decode(out)) 如果模型是用 ZeRO-3 训练的,请进行以下更改. ```diff --# prepare a pretrained model --model = GitLlamaForCausalLM.from_pretrained( -- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 --) -+from heron.models.utils import load_model, load_pretrained_weight -+import yaml -+ -+config_file = f"./projects/opt/exp002_ds.yml" -+ -+# get config -+with open(config_file, "r") as i_: -+ config = yaml.safe_load(i_) -+ -+model = load_model(config["model_config"]) -+model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +- # prepare a pretrained model +- model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +- ) ++ from heron.models.utils import load_model, load_pretrained_weight ++ import yaml ++ ++ config_file = f"./projects/opt/exp002_ds.yml" ++ ++ # get config ++ with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++ model = load_model(config["model_config"]) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ``` ### 训练有素的模型列表 diff --git a/docs/README_JP.md b/docs/README_JP.md index 2be8760..1ec0f38 100644 --- a/docs/README_JP.md +++ b/docs/README_JP.md @@ -269,21 +269,21 @@ print(processor.tokenizer.batch_decode(out)) もしZeRO-3で訓練されたモデルならば、推論用コードに次の変更を加えます。 ```diff --# prepare a pretrained model --model = GitLlamaForCausalLM.from_pretrained( -- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 --) -+from heron.models.utils import load_model, load_pretrained_weight -+import yaml -+ -+config_file = f"./projects/opt/exp002_ds.yml" -+ -+# get config -+with open(config_file, "r") as i_: -+ config = yaml.safe_load(i_) -+ -+model = load_model(config["model_config"]) 
-+model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) +- # prepare a pretrained model +- model = GitLlamaForCausalLM.from_pretrained( +- 'turing-motors/heron-chat-git-Llama-2-7b-v0', torch_dtype=torch.float16 +- ) ++ from heron.models.utils import load_model, load_pretrained_weight ++ import yaml ++ ++ config_file = f"./projects/opt/exp002_ds.yml" ++ ++ # get config ++ with open(config_file, "r") as i_: ++ config = yaml.safe_load(i_) ++ ++ model = load_model(config["model_config"]) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ``` ### 学習済みモデル一覧 From ad4a19b12f76be9cd5eb15a10d9dbf9db8e9b3bc Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 12:19:03 +0000 Subject: [PATCH 44/50] comment lora config --- projects/opt/exp002_ds.yml | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/projects/opt/exp002_ds.yml b/projects/opt/exp002_ds.yml index 59b94a3..0cfdd4a 100644 --- a/projects/opt/exp002_ds.yml +++ b/projects/opt/exp002_ds.yml @@ -2,12 +2,12 @@ training_config: per_device_train_batch_size: 2 per_device_eval_batch_size: 2 gradient_accumulation_steps: 4 - num_train_epochs: 5 + num_train_epochs: 3 dataloader_num_workers: 16 learning_rate: 5.0e-5 output_dir: ./output/ report_to: "wandb" - zero_stage: 2 + zero_stage: 3 precision: "fp16" enable_tensorboard: False seed: 0 @@ -29,22 +29,21 @@ model_config: vision_model_name: openai/clip-vit-base-patch16 num_image_with_embedding: 1 # if 1, no img_temporal_embedding max_length: 512 - keys_to_finetune: - - visual_projection - - num_image_with_embedding + keys_to_finetune: [] keys_to_freeze: [] - use_lora: true - lora: - r: 8 - lora_alpha: 32 - target_modules: - - q_proj - - k_proj - - v_proj - lora_dropout: 0.01 - bias: none - task_type: CAUSAL_LM + # TODO: support LoRA + # use_lora: false + # lora: + # r: 8 + # lora_alpha: 32 + # target_modules: + # - q_proj + # - k_proj + # - v_proj + # lora_dropout: 0.01 + # bias: none + # task_type: CAUSAL_LM dataset_config_path: - ./configs/datasets/m3it_coco.yaml # only coco dataset From b693714d0bbc463fda2ce637bddeea1809b105b5 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Mon, 20 Nov 2023 12:21:14 +0000 Subject: [PATCH 45/50] update README --- README.md | 23 ++++++++++++----------- docs/README_CN.md | 23 ++++++++++++----------- docs/README_JP.md | 23 ++++++++++++----------- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index f8013a8..871da31 100644 --- a/README.md +++ b/README.md @@ -185,17 +185,18 @@ model_config: - num_image_with_embedding keys_to_freeze: [] - use_lora: true - lora: - r: 8 - lora_alpha: 32 - target_modules: - - q_proj - - k_proj - - v_proj - lora_dropout: 0.01 - bias: none - task_type: CAUSAL_LM + # TODO: support LoRA + # use_lora: false + # lora: + # r: 8 + # lora_alpha: 32 + # target_modules: + # - q_proj + # - k_proj + # - v_proj + # lora_dropout: 0.01 + # bias: none + # task_type: CAUSAL_LM dataset_config_path: - ./configs/datasets/m3it_coco.yaml # only coco dataset diff --git a/docs/README_CN.md b/docs/README_CN.md index d69d2aa..f603d7b 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -183,17 +183,18 @@ model_config: - num_image_with_embedding keys_to_freeze: [] - use_lora: true - lora: - r: 8 - lora_alpha: 32 - target_modules: - - q_proj - - k_proj - - v_proj - lora_dropout: 0.01 - bias: none - task_type: CAUSAL_LM + # TODO: support LoRA + # use_lora: false + # lora: + # r: 8 + # 
lora_alpha: 32 + # target_modules: + # - q_proj + # - k_proj + # - v_proj + # lora_dropout: 0.01 + # bias: none + # task_type: CAUSAL_LM dataset_config_path: - ./configs/datasets/m3it_coco.yaml # only coco dataset diff --git a/docs/README_JP.md b/docs/README_JP.md index 1ec0f38..ebb66b1 100644 --- a/docs/README_JP.md +++ b/docs/README_JP.md @@ -183,17 +183,18 @@ model_config: - num_image_with_embedding keys_to_freeze: [] - use_lora: true - lora: - r: 8 - lora_alpha: 32 - target_modules: - - q_proj - - k_proj - - v_proj - lora_dropout: 0.01 - bias: none - task_type: CAUSAL_LM + # TODO: support LoRA + # use_lora: false + # lora: + # r: 8 + # lora_alpha: 32 + # target_modules: + # - q_proj + # - k_proj + # - v_proj + # lora_dropout: 0.01 + # bias: none + # task_type: CAUSAL_LM dataset_config_path: - ./configs/datasets/m3it_coco.yaml # only coco dataset From d92c798bbfe4619ad2a44cb245a8b93818c1394a Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Fri, 24 Nov 2023 12:36:11 +0000 Subject: [PATCH 46/50] save model trained by ZeRO-3 --- heron/utils/utils.py | 2 +- train_ds.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/heron/utils/utils.py b/heron/utils/utils.py index 38a7b9a..fa8204a 100644 --- a/heron/utils/utils.py +++ b/heron/utils/utils.py @@ -209,7 +209,7 @@ def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0, sub_fo torch.save(model_to_save.state_dict(), output_model_file) else: output_state_dict = {} - for k, v in model_to_save.named_parameters(): + for k, v in model_to_save.named_parameters(remove_duplicate=False): if hasattr(v, "ds_id"): with deepspeed.zero.GatheredParameters( _z3_params_to_fetch([v]), enabled=zero_stage_3 diff --git a/train_ds.py b/train_ds.py index d8fb7f8..138c2da 100644 --- a/train_ds.py +++ b/train_ds.py @@ -40,6 +40,7 @@ get_optimizer_grouped_parameters, print_rank_0, set_random_seed, + save_zero_three_model, to_device, ) @@ -275,6 +276,14 @@ def evaluation(model, eval_dataloader): training_config["output_dir"], client_state=client_state ) # save to the latest + if training_config["zero_stage"] == 3: + save_zero_three_model(model, + training_config["global_rank"], + training_config["output_dir"], + zero_stage=training_config["zero_stage"], + sub_folder=f'epoch-{epoch}') + + # TODO: support merging LoRA for ZeRO-3 training if training_config["zero_stage"] != 3: model = model.module From 74d969656702a8f2c55294a0835a81745fcb9429 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Fri, 24 Nov 2023 12:39:13 +0000 Subject: [PATCH 47/50] rm conversion [README] --- README.md | 11 +---------- docs/README_CN.md | 10 +--------- docs/README_JP.md | 11 +---------- 3 files changed, 3 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 871da31..d26d040 100644 --- a/README.md +++ b/README.md @@ -208,15 +208,6 @@ To start learning, execute the following command. ./scripts/run_ds.sh ``` -## [ZeRO-3] Conversion - -After training the model with ZeRO-3, you can obtain a model for inference by generating `pytorch_model.bin` from the checkpoint with the following steps: - -```bash -cd output/opt/exp002_ds # Move to the output dir of your experiment. -python zero_to_fp32.py . pytorch_model.bin # Generating the PyTorch model in the current directory. 
-``` - # Evaluation If you have the model trained by ZeRO-3 @@ -289,7 +280,7 @@ If you have a model trained using ZeRO-3, it must be modified as follows: + config = yaml.safe_load(i_) + + model = load_model(config["model_config"]) -+ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/epoch-1/pytorch_model.bin'), strict=True) ``` ### Pretrained Models diff --git a/docs/README_CN.md b/docs/README_CN.md index f603d7b..92daaef 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -207,14 +207,6 @@ dataset_config_path: ./scripts/run_ds.sh ``` -## [ZeRO-3] 转化 - -使用ZeRO-3训练模型后,可以通过以下步骤从检查点生成 `pytorch_model.bin` ,从而获得用于推理的模型: - -```bash -cd output/opt/exp002_ds # 実験のアウトプットディレクトリに移動します -python zero_to_fp32.py . pytorch_model.bin # PyTorchモデルを生成します -``` # 如何使用 您可以从 Hugging Face Hub 下载训练好的模型:[turing-motors/heron-chat-git-ja-stablelm-base-7b-v0](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v0)
@@ -284,7 +276,7 @@ print(processor.tokenizer.batch_decode(out)) + config = yaml.safe_load(i_) + + model = load_model(config["model_config"]) -+ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/epoch-1/pytorch_model.bin'), strict=True) ``` ### 训练有素的模型列表 diff --git a/docs/README_JP.md b/docs/README_JP.md index ebb66b1..09e1d9e 100644 --- a/docs/README_JP.md +++ b/docs/README_JP.md @@ -206,15 +206,6 @@ dataset_config_path: ./scripts/run_ds.sh ``` -## [ZeRO-3] モデルの変換 - -ZeRO-3で訓練した後、推論に使う `pytorch_model.bin` を生成するには次の手順を実行します。 - -```bash -cd output/opt/exp002_ds # 実験のアウトプットディレクトリに移動します -python zero_to_fp32.py . pytorch_model.bin # PyTorchモデルを生成します -``` - # 利用方法 Hugging Face Hubから学習済みモデルをダウンロードすることができます: [turing-motors/heron-chat-git-ja-stablelm-base-7b-v0](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v0)
@@ -284,7 +275,7 @@ print(processor.tokenizer.batch_decode(out)) + config = yaml.safe_load(i_) + + model = load_model(config["model_config"]) -+ model.load_state_dict(torch.load('./output/opt/exp002_ds/pytorch_model.bin'), strict=True) ++ model.load_state_dict(torch.load('./output/opt/exp002_ds/epoch-1/pytorch_model.bin'), strict=True) ``` ### 学習済みモデル一覧 From f8aecb319e970e20bf27e21c1d93867b2314514f Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Wed, 29 Nov 2023 08:22:40 +0000 Subject: [PATCH 48/50] add HfDeepSpeedConfig on ZeRO-3 training --- train_ds.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/train_ds.py b/train_ds.py index 138c2da..8634765 100644 --- a/train_ds.py +++ b/train_ds.py @@ -24,6 +24,7 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm from transformers import AdamW, AutoTokenizer, SchedulerType, get_scheduler +from transformers.integrations import HfDeepSpeedConfig import wandb from heron.datasets.utils import get_dataset @@ -39,8 +40,8 @@ get_all_reduce_mean, get_optimizer_grouped_parameters, print_rank_0, - set_random_seed, save_zero_three_model, + set_random_seed, to_device, ) @@ -81,6 +82,9 @@ def main(config_file: str, local_rank: int = 0): * training_config["gradient_accumulation_steps"] ) + if training_config["zero_stage"] == 2: + dschf = HFDeepSpeedConfig(ds_config) + # Initialization of wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: wandb.init(project=os.environ["WANDB_PROJECT"], config=config) @@ -277,12 +281,13 @@ def evaluation(model, eval_dataloader): ) # save to the latest if training_config["zero_stage"] == 3: - save_zero_three_model(model, - training_config["global_rank"], - training_config["output_dir"], - zero_stage=training_config["zero_stage"], - sub_folder=f'epoch-{epoch}') - + save_zero_three_model( + model, + training_config["global_rank"], + training_config["output_dir"], + zero_stage=training_config["zero_stage"], + sub_folder=f"epoch-{epoch}", + ) # TODO: support merging LoRA for ZeRO-3 training if training_config["zero_stage"] != 3: From c7418791dacd93e6d11374219ac6f65c11b49acc Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Fri, 1 Dec 2023 11:16:08 +0000 Subject: [PATCH 49/50] fix zero stage --- train_ds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_ds.py b/train_ds.py index 8634765..d948095 100644 --- a/train_ds.py +++ b/train_ds.py @@ -82,8 +82,8 @@ def main(config_file: str, local_rank: int = 0): * training_config["gradient_accumulation_steps"] ) - if training_config["zero_stage"] == 2: - dschf = HFDeepSpeedConfig(ds_config) + if training_config["zero_stage"] == 3: + dschf = HfDeepSpeedConfig(ds_config) # Initialization of wandb if os.environ.get("WANDB_NAME") is not None and local_rank == 0: From 2ebd456dc38b30bb1901ac65774bc24bb01e4591 Mon Sep 17 00:00:00 2001 From: Yuma Ochi Date: Fri, 1 Dec 2023 11:17:21 +0000 Subject: [PATCH 50/50] add initialization for mpirun --- train_ds.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/train_ds.py b/train_ds.py index d948095..7df7417 100644 --- a/train_ds.py +++ b/train_ds.py @@ -59,6 +59,13 @@ def main(config_file: str, local_rank: int = 0): if local_rank == -1: device = torch.device("cuda") + elif local_rank == -100: + # for mpirun launcher + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + deepspeed.init_distributed() + local_rank = int(os.environ["LOCAL_RANK"]) + torch.cuda.set_device(local_rank) + device = 
torch.device("cuda", local_rank) else: torch.cuda.set_device(local_rank) device = torch.device("cuda", local_rank)