LM_text_generator.py
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import PreTrainedTokenizerFast
from preprocess import Preprocess
from os.path import exists
from train import Train
from config import Config

# Folder to write the model to
output_dir = Config.output_dir
# Text file containing the text to fine-tune on or train from scratch
text_file = Config.text_file_location
from_scratch = Config.from_scratch

if not exists(output_dir):
    print("Output directory does not exist")
    exit()
if not exists(text_file):
    print("Text file does not exist")
    exit()

print("Output directory")
print(output_dir)
print("Text file location")
print(text_file)

tokenizer_file_name = output_dir + "/byte-level-BPE_son.tokenizer.json"
trainer_state_path = output_dir + '/trainer_state.json'

if from_scratch is False:  # fine-tuning GPT-2
    print("----Fine-tuning model----")
    # tokenizer from GPT-2
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # if a previously fine-tuned model exists
    if exists(trainer_state_path):  # set model name to the previously trained model
        print("Loading existing model")
        model_name = output_dir
    else:  # otherwise load the pretrained GPT-2 124M parameter model
        print("Loading GPT-2 124M parameter model")
        model_name = "gpt2"
    # load the model
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False)
else:  # training from scratch
    print("----Training from scratch----")
    if exists(tokenizer_file_name):  # the model has already started training
        print("Loading existing model")
        # load the partially trained model from the output directory
        model = AutoModelForCausalLM.from_pretrained(output_dir, use_cache=False)
    else:  # no model exists yet
        print("Setting up new model")
        from tokenizers import ByteLevelBPETokenizer
        from transformers import GPT2Config
        print("----Creating tokenizer----")
        tokenizer = ByteLevelBPETokenizer()
        # train the tokenizer on the given text file, with a vocabulary of about 52,000 (similar to GPT-2)
        print("Training tokenizer")
        tokenizer.train(files=text_file, vocab_size=52_000, min_frequency=2, special_tokens=["<|endoftext|>"])
        # save the tokenizer so it can be reused for further training
        tokenizer.save(tokenizer_file_name)
        print("Creating model config")
        # uses the same architecture as GPT-2
        config = GPT2Config(vocab_size=52_000, max_length=1024, use_cache=False)
        # create the model from the configuration
        print("Creating model")
        model = AutoModelForCausalLM.from_config(config=config)
    # wrap the saved tokenizer file so it can be used with the transformers training utilities
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file_name)

print("----Number of Model Parameters----")
print(model.num_parameters())

# tokenize the dataset
tokenized_datasets = Preprocess.process(tokenizer, text_file)
# train and run analysis
Train.do_train(model, tokenizer, tokenized_datasets, output_dir)
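
The script pulls all of its settings from config.py, which is not shown on this page. Based solely on the three attributes read above (output_dir, text_file_location, from_scratch), a minimal stand-in config could look like the sketch below; the paths are placeholders, not values from the repository.

# config.py -- minimal sketch inferred from the attributes LM_text_generator.py reads.
# The real config.py in the repository may define more settings.
class Config:
    # Directory where the model, tokenizer, and trainer state are written (placeholder path)
    output_dir = "./output"
    # Plain-text file used for fine-tuning or training from scratch (placeholder path)
    text_file_location = "./data/corpus.txt"
    # False = fine-tune GPT-2, True = train a new GPT-2-style model from scratch
    from_scratch = False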
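
preprocess.py and train.py are also not shown on this page. As a rough illustration of the kind of pipeline that Preprocess.process and Train.do_train would need to implement to match the calls above (a sketch under assumptions, not the repository's actual code; every hyperparameter here is made up), the steps usually look like this:

# Hypothetical sketch of preprocess.py / train.py -- NOT the repository's code.
# It only mirrors the interfaces used by LM_text_generator.py.
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling


class Preprocess:
    @staticmethod
    def process(tokenizer, text_file, block_size=128):
        # Load the raw text file as a Hugging Face dataset and tokenize it.
        raw = load_dataset("text", data_files={"train": text_file})

        def tokenize(batch):
            return tokenizer(batch["text"], truncation=True, max_length=block_size)

        return raw.map(tokenize, batched=True, remove_columns=["text"])


class Train:
    @staticmethod
    def do_train(model, tokenizer, tokenized_datasets, output_dir):
        # Causal-LM collator: pads batches and copies input_ids to labels.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = "<|endoftext|>"  # assumption: reuse the end-of-text token for padding
        collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
        args = TrainingArguments(output_dir=output_dir, num_train_epochs=1,
                                 per_device_train_batch_size=2, save_steps=500)
        trainer = Trainer(model=model, args=args, data_collator=collator,
                          train_dataset=tokenized_datasets["train"])
        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)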