update t5 and llama training
BarisSchlichtI committed Jan 3, 2024
1 parent 943338d commit 612d84f
Showing 9 changed files with 122 additions and 94 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -157,4 +157,6 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

Dockerfile
8 changes: 6 additions & 2 deletions README.md
@@ -22,14 +22,18 @@ Before training the model, you need to create a .env file since we need to use the GPU

Model training:
```shell
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm_training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm-training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
```

Model testing (remove `--train` from the script):
```shell
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm_training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm-training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
```

To fine-tune Llama2, make sure that you add your HF credentials to `.env` as follows:

`HF_INFERENCE_TOKEN=YOUR_TOKEN`
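
Assuming the Llama2 run follows the same Docker pattern as the T5 commands above, a sketch of the launch command (the script path `scripts/spot/train_llama2.sh` is part of this commit; the surrounding `screen`/`docker` flags are carried over from the T5 example, not separately verified):

```shell
screen -L -Logfile llama2_train sudo docker run --rm --gpus all -v /reco/llm-training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_llama2.sh
```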


### Zero-shot Learning
Zero-shot learning code for ChatGPT and Llama2 is located under `app/nshot/{model_name}_zero_shot.py`.
137 changes: 69 additions & 68 deletions app/llama_models.py
@@ -1,6 +1,7 @@
import os
import json
from dotenv import load_dotenv
from app.nshot.utils import read_prompt_file

load_dotenv()
# comment out the lines below if you need to use gpu 0.
@@ -13,17 +14,34 @@
from typing import Dict
from tqdm import tqdm
from transformers import (AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq,
EarlyStoppingCallback, Text2TextGenerationPipeline,TrainingArguments)
EarlyStoppingCallback, pipeline, TrainingArguments, BitsAndBytesConfig)
from loguru import logger
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig, PeftModel
from datasets import Dataset

# reference: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/sft_llama2.py

class Llama2Model:
def __init__(self, tokenizer):
self.tokenizer = tokenizer

def train(self, train_ds, val_ds, params: Dict):
# todo: rename this function
instruction_file = read_prompt_file(prompt_file=f"scripts/{params['task']}/prompt/zero_shot_cot_prompt.txt")

if params['debug']:
random_seed = params['random_seed']
sample_ratio = params['sample_ratio']
train_ds = train_ds.sample(int(len(train_ds)*sample_ratio), random_state=random_seed)
val_ds = val_ds.sample(int(len(val_ds)*sample_ratio), random_state=random_seed)

train_ds['instruction'] = train_ds.apply(lambda example: f"### Instruction: {instruction_file}\n ### Input: {example.sentence}\n ### Response: {example.query}", axis=1)
val_ds['instruction'] = val_ds.apply(lambda example: f"### Instruction: {instruction_file}\n ### Input: {example.sentence}\n ### Response: {example.query}", axis=1)

train_ds = Dataset.from_pandas(train_ds)
val_ds = Dataset.from_pandas(val_ds)

cuda_device = params['cuda_device']

logger.info(f"Available devices are {torch.cuda.device_count()}")
@@ -36,66 +54,39 @@ def train(self, train_ds, val_ds, params: Dict):

logger.info(f"Selected device is {device}.")

# add metric func
rouge_score = evaluate.load("rouge")
logger.info("Added Rouge metric.")

def compute_metrics(eval_preds):
preds, labels = eval_preds

if isinstance(preds, tuple):
preds = preds[0]

# Replace -100 in the preds as we can't decode them
preds = np.where(preds != -100, preds, self.tokenizer.pad_token_id)

# Decode generated summaries into text
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
# define LoRA Config
lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
target_modules=["q_proj", "v_proj"],
bias="none",
task_type="CAUSAL_LM"
)

# Replace -100 in the labels as we can't decode them
labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
# Decode reference summaries into text
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# ROUGE expects a newline after each sentence
decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)

decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]
# Compute ROUGE scores
result = rouge_score.compute(
predictions=decoded_preds, references=decoded_labels, use_stemmer=True
)
# Extract the median scores
result = {key: value * 100 for key, value in result.items()}
return {k: round(v, 4) for k, v in result.items()}

# declare model
print(f"pretrained model {params['pretrained_model']}")
model = AutoModelForCausalLM.from_pretrained(params['pretrained_model'],
low_cpu_mem_usage=True,
trust_remote_code=True,
quantization_config=bnb_config,
return_dict=True,
torch_dtype=torch.float16,
device_map="auto",
)

model.to(device)
model.config.use_cache = False

# define LoRA Config
lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
bias="none",
task_type="CAUSAL_LM"
)
# recheck the following
# model.config.pretraining_tp = 1

# add LoRa adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# declare data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=model, label_pad_token_id=-100,
pad_to_multiple_of=8)

model_output_path = params['model_output_path']
learning_rate = params['learning_rate']
@@ -113,11 +104,10 @@ def compute_metrics(eval_preds):
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=epochs,
predict_with_generate=True,
generation_max_length=max_length,
greater_is_better=True,
auto_find_batch_size=True,
metric_for_best_model=params['eval_metric'],
# metric_for_best_model=params['eval_metric'],
greater_is_better=False,
metric_for_best_model='eval_loss',
load_best_model_at_end=True
)

@@ -127,9 +117,10 @@ def compute_metrics(eval_preds):
args=training_args,
train_dataset=train_ds,
eval_dataset=val_ds,
dataset_text_field="instruction",
peft_config=lora_config,
tokenizer=self.tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
max_seq_length=max_length,
callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

@@ -148,34 +139,44 @@ def compute_metrics(eval_preds):
if str(device) != "cpu":
torch.cuda.empty_cache()

def test(self, test_sentences, params):

@torch.inference_mode()
def test(self, test_ds, params):
peft_model_id = params['model_output_path']
config = PeftConfig.from_pretrained(params['model_output_path'])

# load base LLM model and tokenizer
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model = PeftModel.from_pretrained(model, peft_model_id, device_map={"": 0})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
model = model.merge_and_unload()
model.eval()

device = torch.device(f"cuda:{params['cuda_device']}" if torch.cuda.is_available() else "cpu")
pipeline = Text2TextGenerationPipeline(model=model, batch_size=16,
tokenizer=tokenizer,
device=device, # model.device,
clean_up_tokenization_spaces=True)
logger.info('Getting predictions...')
generated_texts = pipeline(test_sentences, do_sample=False, max_length=params['max_length'],
pad_token_id=self.tokenizer.pad_token_id)
test_sentences = test_ds['sentence'].tolist()

logger.info('Predictions is done.')
device = torch.device(params['cuda_device'] if torch.cuda.is_available() else "cpu")

model.eval()

model.to(device)

instruction_text = read_prompt_file(prompt_file=f"scripts/{params['task']}/prompt/zero_shot_cot_prompt.txt")

with open(params['result_file_path'], 'w') as outfile:
for test_inst, generated_text in tqdm(zip(test_sentences, generated_texts), total=len(test_sentences)):
for sentence in tqdm(test_sentences, total=len(test_sentences)):
prompt = f"### Instruction: {instruction_text}\n ### Input: {sentence}\n ### Response:\n "
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)

outputs = model.generate(input_ids=input_ids, max_new_tokens=params['max_length'], do_sample=True, top_p=0.9,temperature=0.5)
generated_instruction = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

processed_data = {
"sentence": test_inst,
"model_result": generated_text['generated_text']
"sentence": sentence,
"model_result": generated_instruction
}

json.dump(processed_data, outfile)
outfile.write('\n')
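
For orientation, the `### Instruction` / `### Input` / `### Response` template built in `train()` and `test()` above yields strings like the following sketch. The values here are hypothetical: the real instruction text is read from `scripts/{task}/prompt/zero_shot_cot_prompt.txt`, and `sentence`/`query` come from the dataset columns.

```python
# Hypothetical stand-ins; real values come from the prompt file and the TSV columns.
instruction_text = "Convert the sentence into a YAML query."
sentence = "turn on the kitchen lights"
query = "action: lights_on\nroom: kitchen"

# Training instance, as assembled by the apply(...) lambdas in train():
train_example = f"### Instruction: {instruction_text}\n ### Input: {sentence}\n ### Response: {query}"

# Inference prompt, as assembled in test(); whatever the model generates
# after "### Response:" is sliced off as the prediction.
test_prompt = f"### Instruction: {instruction_text}\n ### Input: {sentence}\n ### Response:\n "
```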
27 changes: 17 additions & 10 deletions app/main.py
@@ -1,8 +1,10 @@
import os
import pandas as pd
from dotenv import load_dotenv
from app.models import MODELS
from app.tasks import TASKS
from app.nshot.utils import read_prompt_file
from transformers import AutoTokenizer
from huggingface_hub import login

load_dotenv()
# comment out the lines below if you need to use gpu 0.
@@ -25,6 +27,7 @@
parser.add_argument('--model_output_path')
parser.add_argument("--result_file_path")
parser.add_argument('--random_seed', type=int)
parser.add_argument('--sample_ratio', type=float)
parser.add_argument("--train", action="store_true")
parser.add_argument("--test", action="store_true")
parser.add_argument("--debug", action="store_true")
@@ -48,6 +51,10 @@

set_random_seed(args.random_seed)


hf_token = os.getenv("HF_INFERENCE_TOKEN")
login(token=hf_token)

print(f"pretrained model {args.pretrained_model}")

tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model, trust_remote_code=True)
@@ -61,7 +68,7 @@
tokenizer.padding_side = "right"

model = MODELS[args.model_type](tokenizer=tokenizer)
task_func = TASKS[args.task]

task = args.task

if args.train:
@@ -74,8 +81,12 @@
logger.info(f"Max length is {max_length}")

# dataset loading
train_ds = task_func(dataset_path=train_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug)
val_ds = task_func(dataset_path=val_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug)

# train_ds = task_func(dataset_path=train_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug, prompt=prompt)
# val_ds = task_func(dataset_path=val_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug, prompt=prompt)

train_ds = pd.read_csv(train_file_path, sep='\t')
val_ds = pd.read_csv(val_file_path, sep='\t')

model.train(train_ds, val_ds, params)

@@ -88,9 +99,5 @@
result_file_path = params['result_file_path']
debug = params['debug']

test_ds = task_func(dataset_path=test_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug,
test=True)

test_sentences = test_ds['sentence'].tolist()

model.test(test_sentences=test_sentences, params=params)
test_ds = pd.read_csv(test_file_path, sep='\t')
model.test(test_ds=test_ds, params=params)
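
Since `main.py` now loads the splits directly with `pd.read_csv(..., sep='\t')`, each TSV needs at least the `sentence` and `query` columns that `llama_models.py` reads (`example.sentence`, `example.query`, `test_ds['sentence']`). A minimal sketch of a compatible file, with made-up rows:

```python
import pandas as pd

# Two hypothetical rows with the columns train()/test() rely on.
df = pd.DataFrame({
    "sentence": ["turn on the kitchen lights", "set the thermostat to 21 degrees"],
    "query": ["action: lights_on\nroom: kitchen", "action: set_temperature\nvalue: 21"],
})
df.to_csv("train.tsv", sep="\t", index=False)  # same sep='\t' that app/main.py uses
```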
17 changes: 16 additions & 1 deletion app/t5_models.py
@@ -17,13 +17,27 @@
from loguru import logger
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig, PeftModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM
from app.tasks import TASKS


class T5Model:
def __init__(self, tokenizer):
self.tokenizer = tokenizer

def train(self, train_ds, val_ds, params: Dict):
max_length = params['max_length']
task_func = TASKS[params['task']]

if params['debug']:
random_seed = params['random_seed']
sample_ratio = params['sample_ratio']
train_ds = train_ds.sample(int(len(train_ds)*sample_ratio), random_state=random_seed)
val_ds = val_ds.sample(int(len(val_ds)*sample_ratio), random_state=random_seed)

train_ds = task_func(dataset=train_ds, tokenizer=self.tokenizer, max_length=max_length)
val_ds = task_func(dataset=val_ds, tokenizer=self.tokenizer, max_length=max_length)


cuda_device = params['cuda_device']

logger.info(f"Available devices are {torch.cuda.device_count()}")
@@ -143,7 +157,8 @@ def compute_metrics(eval_preds):
if str(device) != "cpu":
torch.cuda.empty_cache()

def test(self, test_sentences, params):
def test(self, test_ds, params):
test_sentences = test_ds['sentence'].tolist()
peft_model_id = params['model_output_path']
config = PeftConfig.from_pretrained(params['model_output_path'])

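Both `T5Model.train` and `Llama2Model.train` now subsample in `--debug` mode with `df.sample(int(len(df) * sample_ratio), random_state=random_seed)`. A quick sketch of the behavior, with hypothetical values for `--sample_ratio` and `--random_seed`:

```python
import pandas as pd

df = pd.DataFrame({"sentence": [f"example {i}" for i in range(1000)]})

sample_ratio, random_seed = 0.1, 0  # hypothetical --sample_ratio / --random_seed values
subset = df.sample(int(len(df) * sample_ratio), random_state=random_seed)

assert len(subset) == 100  # deterministic 10% subsample; same seed -> same rows
```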
7 changes: 1 addition & 6 deletions app/tasks.py
@@ -30,13 +30,8 @@ def preprocess_function(examples: Dict[str, Any], max_length: int, tokenizer: ob
return_tensors="np")
return model_inputs


def load_spot_dataset(dataset_path, tokenizer, max_length, debug, test=False, output_type='yaml'):
def load_spot_dataset(dataset, tokenizer, max_length, test=False, output_type='yaml'):
output_col = None
dataset = pd.read_csv(dataset_path, sep='\t')

if debug:
dataset = dataset[:20]

dataset["sentence"] = dataset["sentence"].apply(lambda x: x.lower())

3 changes: 2 additions & 1 deletion requirements.txt
@@ -15,4 +15,5 @@ jinja2==3.1.2
strictyaml==1.7.3
openai==1.3.9
pint==0.23
trl==0.7.7
bitsandbytes
8 changes: 4 additions & 4 deletions scripts/spot/train_llama2.sh
@@ -8,8 +8,9 @@ MAX_LENGTH=1024
EVAL_METRIC=eval_rouge2
RESULT_FILE_PATH=results/${MODEL_TYPE}_tuned_base_minimized_${MODEL_VERSION}_db-${DATASET_VERSION}_output_yaml_out.tsv

LEARNING_RATE=1e-3
EPOCHS=50
LEARNING_RATE=2e-4
# EPOCHS=50
EPOCHS=3
RANDOM_SEED=0

TRAIN_DATASET=tasks/spot/${DATASET_VERSION}/IMR_Dataset_${DATASET_VERSION}_train_ChatNL_minimized_yaml_out.csv
@@ -40,5 +41,4 @@ python3 -m app.main \
--eval_metric $EVAL_METRIC \
--result_file_path $RESULT_FILE_PATH \
--train \
--test \
--debug
--test