diff --git a/build_dataset.py b/build_dataset.py
index bccb1eb..9c207fc 100644
--- a/build_dataset.py
+++ b/build_dataset.py
@@ -4,17 +4,9 @@
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
-class CFG:
-    SEED: int = 42
-    SEQ_LEN: int = 8192
-    NUM_CPU: int = multiprocessing.cpu_count()
-    HF_ACCOUNT_REPO: str = "YOUR HF ACCOUNT"
-    TOKENIZER: str = "EleutherAI/gpt-neox-20b"
-    DATASET_NAME: str = "EleutherAI/the_pile_deduplicated"
-
 def main(args):
-    tokenizer = AutoTokenizer.from_pretrained(CFG.TOKENIZER)
-    train_dataset = load_dataset(CFG.DATASET_NAME, split="train")
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
+    train_dataset = load_dataset(args.dataset_name, split="train", data_dir=args.data_dir)
 
     def tokenize_function(example):
         return tokenizer([t + tokenizer.eos_token for t in example["text"]])
@@ -22,11 +14,11 @@ def tokenize_function(example):
     tokenized_dataset = train_dataset.map(
         tokenize_function,
         batched=True,
-        num_proc=CFG.NUM_CPU,
+        num_proc=args.num_workers,
         remove_columns=["text"],
     )
 
-    block_size = CFG.SEQ_LEN
+    block_size = args.seq_len
 
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
@@ -47,17 +39,18 @@ def group_texts(examples):
     train_tokenized_dataset = tokenized_dataset.map(
         group_texts,
         batched=True,
-        num_proc=CFG.NUM_CPU,
+        num_proc=args.num_workers,
     )
 
-    train_tokenized_dataset.push_to_hub(CFG.HF_ACCOUNT_REPO)
+    train_tokenized_dataset.push_to_hub(args.hf_account_repo)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Process and push dataset to Hugging Face Hub")
-    parser.add_argument("--seed", type=int, default=CFG.SEED, help="Random seed")
-    parser.add_argument("--seq_len", type=int, default=CFG.SEQ_LEN, help="Sequence length for processing")
-    parser.add_argument("--hf_account", type=str, default=CFG.HF_ACCOUNT_REPO, help="Hugging Face account name and repo")
-    parser.add_argument("--tokenizer", type=str, default=CFG.TOKENIZER, help="Tokenizer model to use")
-    parser.add_argument("--dataset_name", type=str, default=CFG.DATASET_NAME, help="Name of the dataset to process")
+    parser.add_argument("--seq_len", type=int, default=8192, help="Sequence length for processing")
+    parser.add_argument("--hf_account_repo", type=str, default="YOUR HF ACCOUNT/REPO NAME", help="Hugging Face account name and repo")
+    parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b", help="Tokenizer model to use")
+    parser.add_argument("--dataset_name", type=str, default="EleutherAI/the_pile_deduplicated", help="Name of the dataset to process")
+    parser.add_argument("--data_dir", type=str, default=None, help="Name of the dataset directory to process")
+    parser.add_argument("--num_workers", type=int, default=multiprocessing.cpu_count(), help="Number of workers for processing the data")
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)
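
With the hard-coded CFG class removed, every setting now comes from the command line. A sample invocation under these changes might look like the following; the Hub repo name and worker count below are placeholders for illustration, not values taken from this patch:

python build_dataset.py \
    --tokenizer EleutherAI/gpt-neox-20b \
    --dataset_name EleutherAI/the_pile_deduplicated \
    --seq_len 8192 \
    --num_workers 16 \
    --hf_account_repo your-username/pile-dedup-neox-8192

Omitting --num_workers falls back to multiprocessing.cpu_count(), and --data_dir is only needed for datasets that require a subdirectory to be selected.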