Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update finetune code to the latest version of Yi #222

Merged
merged 4 commits on Dec 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions finetune/utils/model/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def create_hf_model(
from_tf=bool(".ckpt" in model_name_or_path),
config=model_config,
trust_remote_code=True,
use_flash_attention_2=True,
)
else:
model = model_class.from_pretrained(
Expand Down
35 changes: 0 additions & 35 deletions finetune/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
import os
from shutil import copy

import deepspeed
import torch
Expand Down Expand Up @@ -90,12 +89,6 @@ def save_hf_format(model, tokenizer, args, sub_folder=""):
print(os.listdir(output_dir))
print(os.getcwd())

source = args.model_name_or_path
target = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
copy(os.path.join(source, "configuration_yi.py"), target)
copy(os.path.join(source, "modeling_yi.py"), target)
copy(os.path.join(source, "tokenization_yi.py"), target)


def get_all_reduce_mean(tensor):
torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
Expand Down Expand Up @@ -258,31 +251,3 @@ def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0):
if global_rank == 0:
torch.save(output_state_dict, output_model_file)
del output_state_dict


def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0):
    """Save a model checkpoint to ``save_dir/pytorch_model.bin``.

    Handles DeepSpeed ZeRO stage 3, where parameters are partitioned
    across ranks and must be gathered before they can be copied to CPU.
    Only ``global_rank == 0`` actually writes the file; parameters whose
    name contains ``"lora"`` are excluded from the stage-3 state dict.

    Args:
        model_ema: Model (or DeepSpeed/DDP engine wrapping one) to save.
        global_rank: Distributed rank of this process.
        save_dir: Directory for the checkpoint; created if missing.
        zero_stage: DeepSpeed ZeRO stage; 3 triggers the gather path.
    """
    is_stage3 = zero_stage == 3
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, "pytorch_model.bin")

    # Unwrap the engine/DDP container if the model is wrapped.
    unwrapped = model_ema.module if hasattr(model_ema, "module") else model_ema

    if not is_stage3:
        # Parameters are fully materialized on every rank: rank 0 saves directly.
        if global_rank == 0:
            torch.save(unwrapped.state_dict(), checkpoint_path)
        return

    gathered = {}
    for name, param in unwrapped.named_parameters():
        if hasattr(param, "ds_id"):
            # Stage-3 partitioned parameter: gather its shards from all
            # ranks before taking a host-side copy.
            with deepspeed.zero.GatheredParameters(
                _z3_params_to_fetch([param]), enabled=is_stage3
            ):
                host_copy = param.data.cpu()
        else:
            host_copy = param.cpu()
        if global_rank == 0 and "lora" not in name:
            gathered[name] = host_copy

    if global_rank == 0:
        torch.save(gathered, checkpoint_path)
    del gathered