This repository has been archived by the owner on Jun 21, 2024. It is now read-only.

Update build_dataset.py #14

Open
wants to merge 2 commits into base: main
build_dataset.py (33 changes: 13 additions & 20 deletions)
@@ -4,29 +4,21 @@
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
-class CFG:
-    SEED: int = 42
-    SEQ_LEN: int = 8192
-    NUM_CPU: int = multiprocessing.cpu_count()
-    HF_ACCOUNT_REPO: str = "YOUR HF ACCOUNT"
-    TOKENIZER: str = "EleutherAI/gpt-neox-20b"
-    DATASET_NAME: str = "EleutherAI/the_pile_deduplicated"
-
 def main(args):
-    tokenizer = AutoTokenizer.from_pretrained(CFG.TOKENIZER)
-    train_dataset = load_dataset(CFG.DATASET_NAME, split="train")
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
+    train_dataset = load_dataset(args.dataset_name, split="train", data_dir=args.data_dir)
 
     def tokenize_function(example):
         return tokenizer([t + tokenizer.eos_token for t in example["text"]])
 
     tokenized_dataset = train_dataset.map(
         tokenize_function,
         batched=True,
-        num_proc=CFG.NUM_CPU,
+        num_proc=args.num_workers,
         remove_columns=["text"],
     )
 
-    block_size = CFG.SEQ_LEN
+    block_size = args.seq_len
 
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
@@ -47,17 +39,18 @@ def group_texts(examples):
     train_tokenized_dataset = tokenized_dataset.map(
         group_texts,
         batched=True,
-        num_proc=CFG.NUM_CPU,
+        num_proc=args.num_workers,
     )
 
-    train_tokenized_dataset.push_to_hub(CFG.HF_ACCOUNT_REPO)
+    train_tokenized_dataset.push_to_hub(args.hf_account_repo)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Process and push dataset to Hugging Face Hub")
-    parser.add_argument("--seed", type=int, default=CFG.SEED, help="Random seed")
-    parser.add_argument("--seq_len", type=int, default=CFG.SEQ_LEN, help="Sequence length for processing")
-    parser.add_argument("--hf_account", type=str, default=CFG.HF_ACCOUNT_REPO, help="Hugging Face account name and repo")
-    parser.add_argument("--tokenizer", type=str, default=CFG.TOKENIZER, help="Tokenizer model to use")
-    parser.add_argument("--dataset_name", type=str, default=CFG.DATASET_NAME, help="Name of the dataset to process")
+    parser.add_argument("--seq_len", type=int, default=8192, help="Sequence length for processing")
+    parser.add_argument("--hf_account_repo", type=str, default="YOUR HF ACCOUNT/REPO NAME", help="Hugging Face account name and repo")
+    parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b", help="Tokenizer model to use")
+    parser.add_argument("--dataset_name", type=str, default="EleutherAI/the_pile_deduplicated", help="Name of the dataset to process")
+    parser.add_argument("--data_dir", type=str, default=None, help="Name of the dataset directory to process")
+    parser.add_argument("--num_workers", type=int, default=multiprocessing.cpu_count(), help="Number of workers for processing the data")
     args = parser.parse_args()
-    main(args)
+    main(args)
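
The body of group_texts is collapsed in the diff above and is not touched by this PR. For readers unfamiliar with the pattern the comment describes, the standard Hugging Face concatenate-and-chunk recipe for causal-LM datasets looks roughly like the sketch below; it is an illustration only, not necessarily this repository's exact implementation.

# Illustrative sketch only: the PR leaves group_texts unchanged and its body is
# collapsed in the diff above. This follows the usual Hugging Face recipe.
block_size = 8192  # in the script this is args.seq_len, captured from main()'s scope

def group_texts(examples):
    # Concatenate every tokenized column (e.g. "input_ids") into one long list.
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    # Drop the remainder so the corpus splits into exact block_size chunks.
    total_length = (total_length // block_size) * block_size
    # Re-slice every column into block_size-long blocks.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }

With the argparse interface this PR introduces, the script would be invoked along these lines (the account/repo name is a placeholder):

    python build_dataset.py --tokenizer EleutherAI/gpt-neox-20b --dataset_name EleutherAI/the_pile_deduplicated --seq_len 8192 --hf_account_repo YOUR_ACCOUNT/YOUR_REPO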