update t5 and llama training
BarisSchlichtI committed Jan 3, 2024
1 parent 943338d commit 612d84f
Showing 9 changed files with 122 additions and 94 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -157,4 +157,6 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

Dockerfile
8 changes: 6 additions & 2 deletions README.md
@@ -22,14 +22,18 @@ Before training the model, you need to create a .env file since we need to use the GPU

Model training:
```shell
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm_training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm-training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
```

Model testing (remove `--train` from the script):
```shell
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm_training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
screen -L -Logfile t5_train sudo docker run --rm --gpus all -v /reco/llm-training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_t5.sh
```

To fine-tune Llama2, make sure that you add your HF credentials to `.env` as follows:

`HF_INFERENCE_TOKEN=YOUR_TOKEN`
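
Assuming the Llama2 run follows the same Docker pattern as the T5 commands above, a sketch of the launch command (the script path `scripts/spot/train_llama2.sh` is part of this commit; the surrounding `screen`/`docker` flags are carried over from the T5 example, not separately verified):

```shell
screen -L -Logfile llama2_train sudo docker run --rm --gpus all -v /reco/llm-training/:/app --env-file .env --name llm_training llm_training:latest bash scripts/spot/train_llama2.sh
```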


### Zero-shot Learning
Zero-shot learning code for ChatGPT and Llama2 is located under `app/nshot/{model_name}_zero_shot.py`.
137 changes: 69 additions & 68 deletions app/llama_models.py
@@ -1,6 +1,7 @@
import os
import json
from dotenv import load_dotenv
from app.nshot.utils import read_prompt_file

load_dotenv()
# comment out the lines below if you need to use gpu 0.
@@ -13,17 +14,34 @@
from typing import Dict
from tqdm import tqdm
from transformers import (AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq,
EarlyStoppingCallback, Text2TextGenerationPipeline,TrainingArguments)
EarlyStoppingCallback, pipeline, TrainingArguments, BitsAndBytesConfig)
from loguru import logger
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig, PeftModel
from datasets import Dataset

# reference: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/sft_llama2.py

class Llama2Model:
def __init__(self, tokenizer):
self.tokenizer = tokenizer

def train(self, train_ds, val_ds, params: Dict):
# todo: rename this function
instruction_file = read_prompt_file(prompt_file=f"scripts/{params['task']}/prompt/zero_shot_cot_prompt.txt")

if params['debug']:
random_seed = params['random_seed']
sample_ratio = params['sample_ratio']
train_ds = train_ds.sample(int(len(train_ds)*sample_ratio), random_state=random_seed)
val_ds = val_ds.sample(int(len(val_ds)*sample_ratio), random_state=random_seed)

train_ds['instruction'] = train_ds.apply(lambda example: f"### Instruction: {instruction_file}\n ### Input: {example.sentence}\n ### Response: {example.query}", axis=1)
val_ds['instruction'] = val_ds.apply(lambda example: f"### Instruction: {instruction_file}\n ### Input: {example.sentence}\n ### Response: {example.query}", axis=1)

train_ds = Dataset.from_pandas(train_ds)
val_ds = Dataset.from_pandas(val_ds)

cuda_device = params['cuda_device']

logger.info(f"Available devices are {torch.cuda.device_count()}")
@@ -36,66 +54,39 @@ def train(self, train_ds, val_ds, params: Dict):

logger.info(f"Selected device is {device}.")

# add metric func
rouge_score = evaluate.load("rouge")
logger.info("Added Rouge metric.")

def compute_metrics(eval_preds):
preds, labels = eval_preds

if isinstance(preds, tuple):
preds = preds[0]

# Replace -100 in the preds as we can't decode them
preds = np.where(preds != -100, preds, self.tokenizer.pad_token_id)

# Decode generated summaries into text
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
# define LoRA Config
lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
target_modules=["q_proj", "v_proj"],
bias="none",
task_type="CAUSAL_LM"
)

# Replace -100 in the labels as we can't decode them
labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
# Decode reference summaries into text
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# ROUGE expects a newline after each sentence
decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)

decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]
# Compute ROUGE scores
result = rouge_score.compute(
predictions=decoded_preds, references=decoded_labels, use_stemmer=True
)
# Extract the median scores
result = {key: value * 100 for key, value in result.items()}
return {k: round(v, 4) for k, v in result.items()}

# declare model
print(f"pretrained model {params['pretrained_model']}")
model = AutoModelForCausalLM.from_pretrained(params['pretrained_model'],
low_cpu_mem_usage=True,
trust_remote_code=True,
quantization_config=bnb_config,
return_dict=True,
torch_dtype=torch.float16,
device_map="auto",
)

model.to(device)
model.config.use_cache = False

# define LoRA Config
lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
bias="none",
task_type="CAUSAL_LM"
)
# recheck the following
# model.config.pretraining_tp = 1

# add LoRa adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# declare data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=model, label_pad_token_id=-100,
pad_to_multiple_of=8)

model_output_path = params['model_output_path']
learning_rate = params['learning_rate']
@@ -113,11 +104,10 @@ def compute_metrics(eval_preds):
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=epochs,
predict_with_generate=True,
generation_max_length=max_length,
greater_is_better=True,
auto_find_batch_size=True,
metric_for_best_model=params['eval_metric'],
# metric_for_best_model=params['eval_metric'],
greater_is_better=False,
metric_for_best_model='eval_loss',
load_best_model_at_end=True
)

@@ -127,9 +117,10 @@ def compute_metrics(eval_preds):
args=training_args,
train_dataset=train_ds,
eval_dataset=val_ds,
dataset_text_field="instruction",
peft_config=lora_config,
tokenizer=self.tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
max_seq_length=max_length,
callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

@@ -148,34 +139,44 @@ def compute_metrics(eval_preds):
if str(device) != "cpu":
torch.cuda.empty_cache()

def test(self, test_sentences, params):

@torch.inference_mode()
def test(self, test_ds, params):
peft_model_id = params['model_output_path']
config = PeftConfig.from_pretrained(params['model_output_path'])

# load base LLM model and tokenizer
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model = PeftModel.from_pretrained(model, peft_model_id, device_map={"": 0})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
model = model.merge_and_unload()
model.eval()

device = torch.device(f"cuda:{params['cuda_device']}" if torch.cuda.is_available() else "cpu")
pipeline = Text2TextGenerationPipeline(model=model, batch_size=16,
tokenizer=tokenizer,
device=device, # model.device,
clean_up_tokenization_spaces=True)
logger.info('Getting predictions...')
generated_texts = pipeline(test_sentences, do_sample=False, max_length=params['max_length'],
pad_token_id=self.tokenizer.pad_token_id)
test_sentences = test_ds['sentence'].tolist()

logger.info('Predictions is done.')
device = torch.device(params['cuda_device'] if torch.cuda.is_available() else "cpu")

model.eval()

model.to(device)

instruction_text = read_prompt_file(prompt_file=f"scripts/{params['task']}/prompt/zero_shot_cot_prompt.txt")

with open(params['result_file_path'], 'w') as outfile:
for test_inst, generated_text in tqdm(zip(test_sentences, generated_texts), total=len(test_sentences)):
for sentence in tqdm(test_sentences, total=len(test_sentences)):
prompt = f"### Instruction: {instruction_text}\n ### Input: {sentence}\n ### Response:\n "
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)

outputs = model.generate(input_ids=input_ids, max_new_tokens=params['max_length'], do_sample=True, top_p=0.9,temperature=0.5)
generated_instruction = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

processed_data = {
"sentence": test_inst,
"model_result": generated_text['generated_text']
"sentence": sentence,
"model_result": generated_instruction
}

json.dump(processed_data, outfile)
outfile.write('\n')
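
For orientation, the `### Instruction` / `### Input` / `### Response` template built in `train()` and `test()` above yields strings like the following sketch. The values here are hypothetical: the real instruction text is read from `scripts/{task}/prompt/zero_shot_cot_prompt.txt`, and `sentence`/`query` come from the dataset columns.

```python
# Hypothetical stand-ins; real values come from the prompt file and the TSV columns.
instruction_text = "Convert the sentence into a YAML query."
sentence = "turn on the kitchen lights"
query = "action: lights_on\nroom: kitchen"

# Training instance, as assembled by the apply(...) lambdas in train():
train_example = f"### Instruction: {instruction_text}\n ### Input: {sentence}\n ### Response: {query}"

# Inference prompt, as assembled in test(); whatever the model generates
# after "### Response:" is sliced off as the prediction.
test_prompt = f"### Instruction: {instruction_text}\n ### Input: {sentence}\n ### Response:\n "
```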
27 changes: 17 additions & 10 deletions app/main.py
@@ -1,8 +1,10 @@
import os
import pandas as pd
from dotenv import load_dotenv
from app.models import MODELS
from app.tasks import TASKS
from app.nshot.utils import read_prompt_file
from transformers import AutoTokenizer
from huggingface_hub import login

load_dotenv()
# comment out the lines below if you need to use gpu 0.
@@ -25,6 +27,7 @@
parser.add_argument('--model_output_path')
parser.add_argument("--result_file_path")
parser.add_argument('--random_seed', type=int)
parser.add_argument('--sample_ratio', type=float)
parser.add_argument("--train", action="store_true")
parser.add_argument("--test", action="store_true")
parser.add_argument("--debug", action="store_true")
@@ -48,6 +51,10 @@

set_random_seed(args.random_seed)


hf_token = os.getenv("HF_INFERENCE_TOKEN")
login(token=hf_token)

print(f"pretrained model {args.pretrained_model}")

tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model, trust_remote_code=True)
@@ -61,7 +68,7 @@
tokenizer.padding_side = "right"

model = MODELS[args.model_type](tokenizer=tokenizer)
task_func = TASKS[args.task]

task = args.task

if args.train:
@@ -74,8 +81,12 @@
logger.info(f"Max length is {max_length}")

# dataset loading
train_ds = task_func(dataset_path=train_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug)
val_ds = task_func(dataset_path=val_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug)

# train_ds = task_func(dataset_path=train_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug, prompt=prompt)
# val_ds = task_func(dataset_path=val_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug, prompt=prompt)

train_ds = pd.read_csv(train_file_path, sep='\t')
val_ds = pd.read_csv(val_file_path, sep='\t')

model.train(train_ds, val_ds, params)

@@ -88,9 +99,5 @@
result_file_path = params['result_file_path']
debug = params['debug']

test_ds = task_func(dataset_path=test_file_path, tokenizer=tokenizer, max_length=max_length, debug=debug,
test=True)

test_sentences = test_ds['sentence'].tolist()

model.test(test_sentences=test_sentences, params=params)
test_ds = pd.read_csv(test_file_path, sep='\t')
model.test(test_ds=test_ds, params=params)
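
Since `main.py` now loads the splits directly with `pd.read_csv(..., sep='\t')`, each TSV needs at least the `sentence` and `query` columns that `llama_models.py` reads (`example.sentence`, `example.query`, `test_ds['sentence']`). A minimal sketch of a compatible file, with made-up rows:

```python
import pandas as pd

# Two hypothetical rows with the columns train()/test() rely on.
df = pd.DataFrame({
    "sentence": ["turn on the kitchen lights", "set the thermostat to 21 degrees"],
    "query": ["action: lights_on\nroom: kitchen", "action: set_temperature\nvalue: 21"],
})
df.to_csv("train.tsv", sep="\t", index=False)  # same sep='\t' that app/main.py uses
```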
17 changes: 16 additions & 1 deletion app/t5_models.py
@@ -17,13 +17,27 @@
from loguru import logger
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig, PeftModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM
from app.tasks import TASKS


class T5Model:
def __init__(self, tokenizer):
self.tokenizer = tokenizer

def train(self, train_ds, val_ds, params: Dict):
max_length = params['max_length']
task_func = TASKS[params['task']]

if params['debug']:
random_seed = params['random_seed']
sample_ratio = params['sample_ratio']
train_ds = train_ds.sample(int(len(train_ds)*sample_ratio), random_state=random_seed)
val_ds = val_ds.sample(int(len(val_ds)*sample_ratio), random_state=random_seed)

train_ds = task_func(dataset=train_ds, tokenizer=self.tokenizer, max_length=max_length)
val_ds = task_func(dataset=val_ds, tokenizer=self.tokenizer, max_length=max_length)


cuda_device = params['cuda_device']

logger.info(f"Available devices are {torch.cuda.device_count()}")
@@ -143,7 +157,8 @@ def compute_metrics(eval_preds):
if str(device) != "cpu":
torch.cuda.empty_cache()

def test(self, test_sentences, params):
def test(self, test_ds, params):
test_sentences = test_ds['sentence'].tolist()
peft_model_id = params['model_output_path']
config = PeftConfig.from_pretrained(params['model_output_path'])

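Both `T5Model.train` and `Llama2Model.train` now subsample in `--debug` mode with `df.sample(int(len(df) * sample_ratio), random_state=random_seed)`. A quick sketch of the behavior, with hypothetical values for `--sample_ratio` and `--random_seed`:

```python
import pandas as pd

df = pd.DataFrame({"sentence": [f"example {i}" for i in range(1000)]})

sample_ratio, random_seed = 0.1, 0  # hypothetical --sample_ratio / --random_seed values
subset = df.sample(int(len(df) * sample_ratio), random_state=random_seed)

assert len(subset) == 100  # deterministic 10% subsample; same seed -> same rows
```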
7 changes: 1 addition & 6 deletions app/tasks.py
@@ -30,13 +30,8 @@ def preprocess_function(examples: Dict[str, Any], max_length: int, tokenizer: ob
return_tensors="np")
return model_inputs


def load_spot_dataset(dataset_path, tokenizer, max_length, debug, test=False, output_type='yaml'):
def load_spot_dataset(dataset, tokenizer, max_length, test=False, output_type='yaml'):
output_col = None
dataset = pd.read_csv(dataset_path, sep='\t')

if debug:
dataset = dataset[:20]

dataset["sentence"] = dataset["sentence"].apply(lambda x: x.lower())

3 changes: 2 additions & 1 deletion requirements.txt
@@ -15,4 +15,5 @@ jinja2==3.1.2
strictyaml==1.7.3
openai==1.3.9
pint==0.23
trl==0.7.7
bitsandbytes
8 changes: 4 additions & 4 deletions scripts/spot/train_llama2.sh
@@ -8,8 +8,9 @@ MAX_LENGTH=1024
EVAL_METRIC=eval_rouge2
RESULT_FILE_PATH=results/${MODEL_TYPE}_tuned_base_minimized_${MODEL_VERSION}_db-${DATASET_VERSION}_output_yaml_out.tsv

LEARNING_RATE=1e-3
EPOCHS=50
LEARNING_RATE=2e-4
# EPOCHS=50
EPOCHS=3
RANDOM_SEED=0

TRAIN_DATASET=tasks/spot/${DATASET_VERSION}/IMR_Dataset_${DATASET_VERSION}_train_ChatNL_minimized_yaml_out.csv
@@ -40,5 +41,4 @@ python3 -m app.main \
--eval_metric $EVAL_METRIC \
--result_file_path $RESULT_FILE_PATH \
--train \
--test \
--debug
--test