-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
192 lines (163 loc) · 6.33 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import os
import pickle
import shutil
from copy import deepcopy
import random
import json
import glob
import numpy as np
import pandas as pd
import torch
from torch import optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from nltk.tokenize import TweetTokenizer
from utils.functions import load_model, WordSplitTokenizer
from utils.args_helper import get_parser, print_opts
from utils.data_utils import load_sequence_classification_dataset, SequenceClassificationDataset, load_dataset
from utils.metrics import sentiment_metrics_fn
from sklearn.metrics import classification_report
def set_seed(seed):
    """Seed every RNG used by the pipeline for reproducible runs.

    Covers Python's `random`, NumPy, and torch (CPU and CUDA).

    Args:
        seed (int): the seed to apply to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed *all* visible GPUs, not just the current device; this is a
    # no-op when CUDA is unavailable, so it is always safe to call.
    torch.cuda.manual_seed_all(seed)
def get_lr(args, optimizer):
    """Return the learning rate of the optimizer's first param group.

    Args:
        args: unused; kept only for call-site compatibility.
        optimizer: torch optimizer (anything exposing ``param_groups``).

    Returns:
        The ``'lr'`` value of the first param group, or ``None`` when the
        optimizer has no param groups at all.
    """
    groups = optimizer.param_groups
    return groups[0]['lr'] if groups else None
def metrics_to_string(metric_dict):
    """Format a metrics dict as one space-separated ``name:value`` string.

    Each value is rendered with exactly two decimal places.

    Args:
        metric_dict (dict): metric name -> numeric value.

    Returns:
        str: e.g. ``'acc:0.91 f1:0.88'`` (empty string for an empty dict).
    """
    return ' '.join(f'{name}:{score:.2f}' for name, score in metric_dict.items())
if __name__ == "__main__":
    # Define constants
    logging_dir = "logs"

    # Make sure cuDNN is deterministic so runs are reproducible
    torch.backends.cudnn.deterministic = True

    # Parse args (returned as a dict-like object)
    args = get_parser()

    ## Helper 1: Create output directory
    def create_output_directory(model_dir, dataset_name, task, dataset_lang, model_checkpoint, seed, num_sample, force):
        """Build (and create on disk) the output directory for this run.

        Layout:
            {model_dir}/{dataset_name}/{task}/{dataset_lang}/{checkpoint}_{seed}_{num_sample}

        `/` in the checkpoint name (e.g. HF hub ids like `org/model`) is
        replaced by `-` so it stays a single path component.

        Args:
            model_dir: root directory for all saved models.
            dataset_name, task, dataset_lang: identify the experiment.
            model_checkpoint: model checkpoint id (may contain '/').
            seed, num_sample: run configuration, embedded in the dir name.
            force: overwrite an already-finished run instead of raising.

        Returns:
            str: the (existing) output directory path.

        Raises:
            Exception: the directory already holds a completed run
                (classification_report_df.csv exists) and force is False.
        """
        output_dir = '{}/{}/{}/{}/{}_{}_{}'.format(
            model_dir,
            dataset_name,
            task,
            dataset_lang,
            model_checkpoint.replace('/', '-'),
            seed,
            num_sample,
        )
        print(f"output_dir: {output_dir}")
        # A saved classification report marks a completed run; refuse to
        # clobber it unless force was requested.
        if not os.path.exists(f'{output_dir}/classification_report_df.csv'):
            os.makedirs(output_dir, exist_ok=True)
        elif force:  # BUG FIX: was `args['force']` — the parameter was ignored
            print(f'overwriting model directory `{output_dir}`')
        else:
            raise Exception(f'model directory `{output_dir}` already exists, use --force if you want to overwrite the folder')
        return output_dir

    # Specify output dir (checkpoint name is sanitized inside the helper,
    # so we no longer pre-apply .replace('/','-') here — it was done twice)
    output_dir = create_output_directory(
        args["model_dir"],
        args["dataset_name"],
        args["task"],
        args["lang"],
        args['model_checkpoint'],
        args['seed'],
        args["num_sample"],
        args["force"]
    )

    # Set random seed
    set_seed(args["seed"])

    # Load dataset splits
    dset = load_dataset(
        dataset=args["dataset_name"],
        task=args["task"],
        lang=args["lang"],
        num_sample=int(args["num_sample"]),
        base_path="./data"
    )

    # Map each unique label seen in train + valid to an integer id
    unique_labels = list(set(dset["train"][args["label_column_name"]] + dset["valid"][args["label_column_name"]]))
    strlabel2int = {label: idx for idx, label in enumerate(unique_labels)}
    print(f"strlabel2int: {strlabel2int}")
    args["num_labels"] = len(strlabel2int)

    # Load model
    model, tokenizer, vocab_path, config_path = load_model(args)
    optimizer = optim.Adam(model.parameters(), lr=args["lr"])

    # Transfer model to GPU
    if args["device"] == "cuda":
        model = model.cuda()

    # Get train, valid and test split
    train_dataset, valid_dataset, test_dataset = load_sequence_classification_dataset(
        dset, strlabel2int, tokenizer, args["text_column_name"], args["label_column_name"], args["num_sample"], args["seed"]
    )
    print(f"len(train_dataset): {len(train_dataset)}")
    print(f"len(valid_dataset): {len(valid_dataset)}")
    print(f"len(test_dataset): {len(test_dataset)}")

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,                                  # output directory
        dataloader_num_workers=8,
        num_train_epochs=args["n_epochs"],                      # total number of training epochs
        per_device_train_batch_size=args["train_batch_size"],   # batch size per device during training
        per_device_eval_batch_size=args["eval_batch_size"],     # batch size for evaluation
        learning_rate=args["lr"],                               # initial learning rate (comment fixed; was mislabeled "warmup steps")
        weight_decay=args["gamma"],                             # strength of weight decay
        gradient_accumulation_steps=args["grad_accum"],         # gradient accumulation
        logging_dir=logging_dir,                                # directory for storing logs
        logging_strategy="steps",
        evaluation_strategy='epoch',
        save_strategy="epoch",
        logging_steps=10,
        load_best_model_at_end=True,                            # reload the best checkpoint before final evaluation
        save_total_limit=1
    )

    # Train model, stopping early after 3 epochs without validation improvement
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=sentiment_metrics_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    trainer.train()
    print("## -- Training Done. -- ##")

    valid_res = trainer.predict(valid_dataset)
    print(valid_res.metrics)

    ## -- Evaluation -- ##
    print("=========== EVALUATION PHASE ===========")
    eval_metrics = {}
    test_res = trainer.predict(test_dataset)
    eval_metrics[args["lang"]] = test_res.metrics
    print(f'Test results: {test_res.metrics}')

    # Get prediction and true labels
    y_pred = test_res.predictions.argmax(axis=1)
    y_true = test_dataset.labels
    # NOTE(review): assumes test_dataset.labels holds the raw string labels;
    # confirm against SequenceClassificationDataset before relying on this.
    y_true = [strlabel2int[true_i] for true_i in y_true]

    # Generate classification report
    cr = classification_report(y_true, y_pred, output_dict=True)
    cr_df = pd.DataFrame(cr).transpose()

    # Save final model
    trainer.save_model(f"{output_dir}/final_model")

    # Save test results (`with` closes the file; redundant f.close() removed)
    with open(f"{output_dir}/test_results.json", "w+") as f:
        json.dump({"valid": valid_res.metrics, "test": eval_metrics}, f)

    # Save classification report
    cr_df.to_csv(f"{output_dir}/classification_report_df.csv")

    # Save mapping of str labels to int
    with open(f"{output_dir}/strlabel2int.json", "w+") as f:
        json.dump(strlabel2int, f)

    # Save prediction results
    with open(f"{output_dir}/test_prediction_{args['lang']}.pkl", "wb") as f:
        pickle.dump(test_res, f)

    print("## -- Evaluation Done. -- ##")