# tokenization.py
import argparse
from itertools import chain

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

from utils import load_yaml
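# `load_yaml` lives in the project-local `utils` module, which is not included here.
# A minimal sketch of what it is assumed to do (an assumption, not the actual implementation):
#
#     import yaml
#
#     def load_yaml(path: str) -> dict:
#         with open(path) as f:
#             return yaml.safe_load(f)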
# dataloaders
def build_dataloaders(
    config,
    sequence_length: int = 2048,
):
"""
Build data loaders for training.
This function performs the following steps:
1. Load the tokenizer from the pretrained "EleutherAI/gpt-neox-20b" model.
2. Load the "openwebtext" dataset.
3. Tokenize the dataset, adding the end-of-sentence token to each text.
4. Process the tokenized dataset into chunks of a specified block size.
Returns:
Dataset: The processed dataset ready for training.
"""
    tokenizer = AutoTokenizer.from_pretrained(config["model_path"])
    # Register a pad token up front so the label masking in group_texts and the padding
    # collator below have a valid pad_token_id to work with.
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    dataset = load_dataset(
        config["data_path"],
        split=f"train[:{config['length']}]",
        cache_dir=config["cache_dir"],
    )
    dataset = dataset.shuffle()
    # Tokenize the raw text, appending the EOS token to mark document boundaries.
    tokenized_dataset = dataset.map(
        lambda example: tokenizer([t + tokenizer.eos_token for t in example["text"]]),
        batched=True,
        remove_columns=["text"],
        num_proc=32,
    )
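    # For example (assuming a GPT-NeoX-style fast tokenizer), a batch like
    # {"text": ["hello world"]} becomes {"input_ids": [[..., eos_id]], "attention_mask": [[1, ...]]};
    # the exact ids depend on the tokenizer named in config["model_path"].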
    # config["max_length"] overrides the sequence_length default passed to this function.
    sequence_length = config["max_length"]
    block_size = sequence_length

    # Main data processing function: concatenates all texts from the dataset and splits
    # them into chunks of block_size tokens.
    def group_texts(examples):
        # Concatenate all tokenized texts in the batch.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the small remainder so every chunk is exactly block_size tokens long.
        # (Padding could be used instead of dropping, if the model supported it.)
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split into chunks of block_size.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # Labels mirror input_ids, with -100 wherever the token is padding so those
        # positions are ignored by the loss.
        result["labels"] = [
            [-100 if token == tokenizer.pad_token_id else token for token in text]
            for text in result["input_ids"]
        ]
        return result
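    # Illustration of the chunking above (not part of the pipeline): with block_size = 4,
    # concatenated input_ids [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] are trimmed to 8 tokens and
    # split into [[1, 2, 3, 4], [5, 6, 7, 8]]; the trailing [9, 10] remainder is dropped.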
    # Apply the chunking to the whole tokenized dataset and save the result to the Hub.
    train_dataset = tokenized_dataset.map(
        group_texts, batched=True, num_proc=32
    )
    train_dataset.push_to_hub(config["savedata_dir"], private=True)
    # Create a data collator that dynamically pads batches to the longest sequence in the
    # batch. After group_texts every chunk is already block_size tokens long, so in
    # practice little or no padding is needed.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Use the collator when building the training data loader.
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["train_args"]["per_device_train_batch_size"],
        shuffle=True,
        collate_fn=data_collator,
    )
    return train_dataloader
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str)
    # parser.add_argument("--local_rank", type=int)
    args = parser.parse_args()
    config = load_yaml(args.config_path)
    print(build_dataloaders(config))
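# Example of the YAML config this script expects. The key names are taken from the code
# above; the values are placeholders (model and dataset names are those mentioned in the
# docstring), not the original configuration:
#
#   model_path: EleutherAI/gpt-neox-20b
#   data_path: openwebtext
#   length: 100000
#   cache_dir: /path/to/cache
#   max_length: 2048
#   savedata_dir: username/tokenized-openwebtext
#   train_args:
#     per_device_train_batch_size: 8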