From 23d90de6726c537eba788ac3a465508dfb427890 Mon Sep 17 00:00:00 2001
From: Tanishq Abraham <37097934+tmabraham@users.noreply.github.com>
Date: Mon, 9 Oct 2023 01:19:01 -0700
Subject: [PATCH 1/2] Update build_dataset.py

---
 build_dataset.py | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/build_dataset.py b/build_dataset.py
index bccb1eb..0f38850 100644
--- a/build_dataset.py
+++ b/build_dataset.py
@@ -2,19 +2,12 @@
 import argparse
 from itertools import chain
 from datasets import load_dataset
-from transformers import AutoTokenizer
-
-class CFG:
-    SEED: int = 42
-    SEQ_LEN: int = 8192
-    NUM_CPU: int = multiprocessing.cpu_count()
-    HF_ACCOUNT_REPO: str = "YOUR HF ACCOUNT"
-    TOKENIZER: str = "EleutherAI/gpt-neox-20b"
-    DATASET_NAME: str = "EleutherAI/the_pile_deduplicated"
+from transformers import AutoTokenizer, set_seed
 
 def main(args):
-    tokenizer = AutoTokenizer.from_pretrained(CFG.TOKENIZER)
-    train_dataset = load_dataset(CFG.DATASET_NAME, split="train")
+    set_seed(args.seed)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
+    train_dataset = load_dataset(args.dataset_name, split="train", data_dir=args.data_dir)
 
     def tokenize_function(example):
         return tokenizer([t + tokenizer.eos_token for t in example["text"]])
@@ -22,11 +15,11 @@ def tokenize_function(example):
     tokenized_dataset = train_dataset.map(
         tokenize_function,
         batched=True,
-        num_proc=CFG.NUM_CPU,
+        num_proc=args.num_workers,
         remove_columns=["text"],
     )
 
-    block_size = CFG.SEQ_LEN
+    block_size = args.seq_len
 
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
@@ -47,17 +40,19 @@ def group_texts(examples):
     train_tokenized_dataset = tokenized_dataset.map(
         group_texts,
         batched=True,
-        num_proc=CFG.NUM_CPU,
+        num_proc=args.num_workers,
     )
 
-    train_tokenized_dataset.push_to_hub(CFG.HF_ACCOUNT_REPO)
+    train_tokenized_dataset.push_to_hub(args.hf_account_repo)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Process and push dataset to Hugging Face Hub")
-    parser.add_argument("--seed", type=int, default=CFG.SEED, help="Random seed")
-    parser.add_argument("--seq_len", type=int, default=CFG.SEQ_LEN, help="Sequence length for processing")
-    parser.add_argument("--hf_account", type=str, default=CFG.HF_ACCOUNT_REPO, help="Hugging Face account name and repo")
-    parser.add_argument("--tokenizer", type=str, default=CFG.TOKENIZER, help="Tokenizer model to use")
-    parser.add_argument("--dataset_name", type=str, default=CFG.DATASET_NAME, help="Name of the dataset to process")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--seq_len", type=int, default=8192, help="Sequence length for processing")
+    parser.add_argument("--hf_account_repo", type=str, default="YOUR HF ACCOUNT/REPO NAME", help="Hugging Face account name and repo")
+    parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b", help="Tokenizer model to use")
+    parser.add_argument("--dataset_name", type=str, default="EleutherAI/the_pile_deduplicated", help="Name of the dataset to process")
+    parser.add_argument("--data_dir", type=str, default=None, help="Name of the dataset directory to process")
+    parser.add_argument("--num_workers", type=int, default=multiprocessing.cpu_count(), help="Number of workers for processing the data")
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)

From 45c91ebf390737ce0fed03f8078607f35d45beba Mon Sep 17 00:00:00 2001
From: Tanishq Abraham <37097934+tmabraham@users.noreply.github.com>
Date: Mon, 9 Oct 2023 01:22:06 -0700
Subject: [PATCH 2/2] Update build_dataset.py

---
 build_dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/build_dataset.py b/build_dataset.py
index 0f38850..9c207fc 100644
--- a/build_dataset.py
+++ b/build_dataset.py
@@ -2,10 +2,9 @@
 import argparse
 from itertools import chain
 from datasets import load_dataset
-from transformers import AutoTokenizer, set_seed
+from transformers import AutoTokenizer
 
 def main(args):
-    set_seed(args.seed)
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
     train_dataset = load_dataset(args.dataset_name, split="train", data_dir=args.data_dir)
 
@@ -47,7 +46,6 @@ def group_texts(examples):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Process and push dataset to Hugging Face Hub")
-    parser.add_argument("--seed", type=int, default=42, help="Random seed")
     parser.add_argument("--seq_len", type=int, default=8192, help="Sequence length for processing")
     parser.add_argument("--hf_account_repo", type=str, default="YOUR HF ACCOUNT/REPO NAME", help="Hugging Face account name and repo")
     parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b", help="Tokenizer model to use")
     parser.add_argument("--dataset_name", type=str, default="EleutherAI/the_pile_deduplicated", help="Name of the dataset to process")