diff --git a/docs/source/detoxifying_a_lm.mdx b/docs/source/detoxifying_a_lm.mdx
index b07a166ba8..3fe2c80ae9 100644
--- a/docs/source/detoxifying_a_lm.mdx
+++ b/docs/source/detoxifying_a_lm.mdx
@@ -58,13 +58,13 @@ And its `continuation` value:
 We want to increase the chance for the model to generate toxic prompts so we get more learning signal. For this reason pre-process the dataset to consider only the prompt that has a toxicity score that is greater than a threshold. We can do this in a few lines of code:
 
 ```python
-ds = load_dataset("allenai/real-toxicity-prompts", split="train")
+train_dataset = load_dataset("allenai/real-toxicity-prompts", split="train")
 
 def filter_fn(sample):
     toxicity = sample["prompt"]["toxicity"]
     return toxicity is not None and toxicity > 0.3
 
-ds = ds.filter(filter_fn, batched=False)
+train_dataset = train_dataset.filter(filter_fn, batched=False)
 ```
 
 ### Reward function
diff --git a/examples/datasets/tokenize_ds.py b/examples/datasets/tokenize_ds.py
index 373c33e9a3..755c04fceb 100644
--- a/examples/datasets/tokenize_ds.py
+++ b/examples/datasets/tokenize_ds.py
@@ -29,7 +29,7 @@
 
 @dataclass
 class ScriptArguments:
-    dataset: str = field(
+    dataset_name: str = field(
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style", metadata={"help": "The dataset to load"}
     )
     model: str = field(default="gpt2", metadata={"help": "The model to use for tokenization"})
@@ -40,7 +40,7 @@ class ScriptArguments:
 
 if __name__ == "__main__":
     args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0]
-    ds = load_dataset(args.dataset)
+    dataset = load_dataset(args.dataset_name)
     tokenizer = AutoTokenizer.from_pretrained(args.model)
     if tokenizer.chat_template is None:
         tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
@@ -50,5 +50,5 @@ def process(row):
         row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
         return row
 
-    ds = ds.map(process, num_proc=args.dataset_num_proc)
-    print(ds["train"][0]["chosen"])
+    dataset = dataset.map(process, num_proc=args.dataset_num_proc)
+    print(dataset["train"][0]["chosen"])
diff --git a/examples/scripts/bco.py b/examples/scripts/bco.py
index db3c325ea0..23d171a8a0 100644
--- a/examples/scripts/bco.py
+++ b/examples/scripts/bco.py
@@ -114,38 +114,38 @@ def get_model_response(example, llm_name: str):
 
     dataset = load_dataset("openbmb/UltraFeedback")["train"]
 
-    ds = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc)
-    ds = ds.filter(
+    dataset = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc)
+    dataset = dataset.filter(
         lambda example: len(example["models"]) == len(example["completions"]), batched=False, num_proc=num_proc
     )
 
     METRIC = "helpfulness"
 
-    ds = ds.map(
+    dataset = dataset.map(
         get_model_rating,
         batched=False,
         fn_kwargs={"metric": METRIC, "llm_name": llm_name},
         num_proc=num_proc,
     )
 
-    ds = ds.map(
+    dataset = dataset.map(
         get_model_response,
         batched=False,
         fn_kwargs={"llm_name": llm_name},
         num_proc=num_proc,
     )
 
-    ds = ds.select_columns(["source", "instruction", "response", "helpfulness"])
+    dataset = dataset.select_columns(["source", "instruction", "response", "helpfulness"])
 
-    ds = ds.rename_columns({"instruction": "prompt", "response": "completion"})
-    ds = ds.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc)
+    dataset = dataset.rename_columns({"instruction": "prompt", "response": "completion"})
+    dataset = dataset.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc)
 
-    ds = ds.map(
+    dataset = dataset.map(
         lambda example: {"prompt": [{"role": "user", "content": example["prompt"]}]},
         batched=False,
         num_proc=num_proc,
     )
-    dataset = ds.train_test_split(test_size=0.05, seed=42)
+    dataset = dataset.train_test_split(test_size=0.05, seed=42)
 
     return dataset
@@ -209,7 +209,7 @@ def format_dataset(example):
     with PartialState().local_main_process_first():
         # Load the dataset
         dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=bco_args.dataset_num_proc)
-        formatted_dataset = dataset.map(format_dataset, batched=False, num_proc=bco_args.dataset_num_proc)
+        dataset = dataset.map(format_dataset, batched=False, num_proc=bco_args.dataset_num_proc)
 
     accelerator = Accelerator()
     embedding_model = AutoModel.from_pretrained(
@@ -233,8 +233,8 @@ def format_dataset(example):
         model,
         ref_model,
         args=bco_args,
-        train_dataset=formatted_dataset["train"],
-        eval_dataset=formatted_dataset["test"],
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_args),
         embedding_func=embedding_func,
diff --git a/examples/scripts/cpo.py b/examples/scripts/cpo.py
index b4c6386850..924cbf162a 100644
--- a/examples/scripts/cpo.py
+++ b/examples/scripts/cpo.py
@@ -64,7 +64,7 @@
 
 @dataclass
 class ScriptArguments:
-    dataset: str = field(
+    dataset_name: str = field(
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style",
         metadata={"help": "The name of the dataset to use."},
     )
@@ -89,7 +89,7 @@ class ScriptArguments:
     ################
     # Dataset
    ################
-    ds = load_dataset(args.dataset)
+    dataset = load_dataset(args.dataset_name)
 
     if tokenizer.chat_template is None:
         tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
@@ -101,10 +101,7 @@ def process(row):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_proc=cpo_args.dataset_num_proc)
-
-    train_dataset = ds["train"]
-    eval_dataset = ds["test"]
+        dataset = dataset.map(process, num_proc=cpo_args.dataset_num_proc)
 
     ################
     # Training
@@ -112,8 +109,8 @@ def process(row):
     trainer = CPOTrainer(
         model,
         args=cpo_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index 94abfd3efc..99f3bd16aa 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -112,17 +112,14 @@
     ################
     # Dataset
     ################
-    ds = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     with PartialState().local_main_process_first():
-        ds = ds.map(maybe_extract_prompt, num_proc=training_args.dataset_num_proc)
-        ds = ds.map(
+        dataset = dataset.map(maybe_extract_prompt, num_proc=training_args.dataset_num_proc)
+        dataset = dataset.map(
             maybe_apply_chat_template, num_proc=training_args.dataset_num_proc, fn_kwargs={"tokenizer": tokenizer}
         )
 
-    train_dataset = ds[args.dataset_train_split]
-    eval_dataset = ds[args.dataset_test_split]
-
     ################
     # Training
     ################
@@ -130,8 +127,8 @@
         model,
         ref_model,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=tokenizer,
         peft_config=peft_config,
     )
diff --git a/examples/scripts/dpo_visual.py b/examples/scripts/dpo_visual.py
index 277d786c5c..3f0fe8665d 100644
--- a/examples/scripts/dpo_visual.py
+++ b/examples/scripts/dpo_visual.py
@@ -105,7 +105,7 @@
     ################
     # Dataset
     ################
-    ds = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     def process(row):
         row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)
@@ -116,10 +116,7 @@ def process(row):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_proc=training_args.dataset_num_proc)
-
-    train_dataset = ds[args.dataset_train_split]
-    eval_dataset = ds[args.dataset_test_split]
+        dataset = dataset.map(process, num_proc=training_args.dataset_num_proc)
 
     ################
     # Training
@@ -128,8 +125,8 @@ def process(row):
         model,
         ref_model,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=processor,
         peft_config=peft_config,
     )
diff --git a/examples/scripts/evals/judge_tldr.py b/examples/scripts/evals/judge_tldr.py
index ac4d7cfb9a..9e6111cf7c 100644
--- a/examples/scripts/evals/judge_tldr.py
+++ b/examples/scripts/evals/judge_tldr.py
@@ -62,13 +62,13 @@ class ScriptArguments:
 args = parser.parse_args_into_dataclasses()[0]
 
 # Load the dataset
-raw_dataset = load_dataset("trl-lib/tldr", split="validation")
+dataset = load_dataset("trl-lib/tldr", split="validation")
 if args.num_examples is not None:
-    raw_dataset = raw_dataset.select(range(args.num_examples))
+    dataset = dataset.select(range(args.num_examples))
 
 # Extract the prompts and reference completions
-prompts = raw_dataset["prompt"]
-reference_completions = raw_dataset["completion"]
+prompts = dataset["prompt"]
+reference_completions = dataset["completion"]
 
 # Generate the model completions
 sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=200)  # very generous max token length
diff --git a/examples/scripts/gkd.py b/examples/scripts/gkd.py
index 05791ba36e..c268ac0c63 100644
--- a/examples/scripts/gkd.py
+++ b/examples/scripts/gkd.py
@@ -103,19 +103,16 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     with PartialState().local_main_process_first():
-        raw_datasets = raw_datasets.map(
+        dataset = dataset.map(
             lambda x: {
                 "prompt": tokenizer.apply_chat_template(x["prompt"], tokenize=False, add_generation_prompt=True)
             },
             num_proc=training_args.dataset_num_proc,
         )
 
-    train_dataset = raw_datasets[args.dataset_train_split]
-    eval_dataset = raw_datasets[args.dataset_test_split]
-
     ################
     # Training
     ################
@@ -123,8 +120,8 @@
         model=model_config.model_name_or_path,
         teacher_model=training_args.teacher_model_name_or_path,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/kto.py b/examples/scripts/kto.py
index df5dfb105d..09d30d62f6 100644
--- a/examples/scripts/kto.py
+++ b/examples/scripts/kto.py
@@ -113,15 +113,15 @@ def format_dataset(example):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        formatted_dataset = dataset.map(format_dataset, num_proc=kto_args.dataset_num_proc)
+        dataset = dataset.map(format_dataset, num_proc=kto_args.dataset_num_proc)
 
     # Initialize the KTO trainer
     kto_trainer = KTOTrainer(
         model,
         ref_model,
         args=kto_args,
-        train_dataset=formatted_dataset["train"],
-        eval_dataset=formatted_dataset["test"],
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_args),
     )
diff --git a/examples/scripts/orpo.py b/examples/scripts/orpo.py
index b5ce93dfd8..9aa0684c6f 100644
--- a/examples/scripts/orpo.py
+++ b/examples/scripts/orpo.py
@@ -64,7 +64,7 @@
 
 @dataclass
 class ScriptArguments:
-    dataset: str = field(
+    dataset_name: str = field(
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style",
         metadata={"help": "The name of the dataset to use."},
     )
@@ -89,7 +89,7 @@ class ScriptArguments:
     ################
     # Dataset
     ################
-    ds = load_dataset(args.dataset)
+    dataset = load_dataset(args.dataset_name)
 
     if tokenizer.chat_template is None:
         tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
@@ -102,10 +102,7 @@ def process(row):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_prc=orpo_args.dataset_num_proc)
-
-    train_dataset = ds["train"]
-    eval_dataset = ds["test"]
+        dataset = dataset.map(process, num_proc=orpo_args.dataset_num_proc)
 
     ################
     # Training
@@ -113,8 +110,8 @@ def process(row):
     trainer = ORPOTrainer(
         model,
         args=orpo_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/ppo/ppo.py b/examples/scripts/ppo/ppo.py
index 408ce29778..5312f32cc7 100644
--- a/examples/scripts/ppo/ppo.py
+++ b/examples/scripts/ppo/ppo.py
@@ -87,10 +87,10 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset("trl-internal-testing/descriptiveness-sentiment-trl-style", split="descriptiveness")
+    dataset = load_dataset("trl-internal-testing/descriptiveness-sentiment-trl-style", split="descriptiveness")
     eval_samples = 20
-    train_dataset = raw_datasets.select(range(len(raw_datasets) - eval_samples))
-    eval_dataset = raw_datasets.select(range(len(raw_datasets) - eval_samples, len(raw_datasets)))
+    train_dataset = dataset.select(range(len(dataset) - eval_samples))
+    eval_dataset = dataset.select(range(len(dataset) - eval_samples, len(dataset)))
     dataset_text_field = "prompt"
 
     def prepare_dataset(dataset, tokenizer):
diff --git a/examples/scripts/ppo/ppo_tldr.py b/examples/scripts/ppo/ppo_tldr.py
index 4fb8b23d2e..138e0f2570 100644
--- a/examples/scripts/ppo/ppo_tldr.py
+++ b/examples/scripts/ppo/ppo_tldr.py
@@ -89,9 +89,9 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
-    train_dataset = raw_datasets["train"]
-    eval_dataset = raw_datasets["validation"]
+    dataset = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["validation"]
 
     def prepare_dataset(dataset, tokenizer):
         """pre-tokenize the dataset before training; only collate during training"""
diff --git a/examples/scripts/reward_modeling.py b/examples/scripts/reward_modeling.py
index 3daf375352..e00e5d8ff8 100644
--- a/examples/scripts/reward_modeling.py
+++ b/examples/scripts/reward_modeling.py
@@ -113,7 +113,7 @@
     #############################
     # Load and preprocess dataset
     #############################
-    raw_datasets = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     def preprocess_function(examples):
         new_examples = {
@@ -137,25 +137,22 @@ def preprocess_function(examples):
         # This assumes the chosen/rejected columns are in the OpenAI messages format.
         chosen_fn = conversations_formatting_function(tokenizer, "chosen")
         rejected_fn = conversations_formatting_function(tokenizer, "rejected")
-        raw_datasets = raw_datasets.map(
+        dataset = dataset.map(
             lambda x: {"chosen": chosen_fn(x), "rejected": rejected_fn(x)}, num_proc=config.dataset_num_proc
         )
         # Tokenize inputs
-        raw_datasets = raw_datasets.map(
+        dataset = dataset.map(
             preprocess_function,
             batched=True,
             num_proc=config.dataset_num_proc,
         )
         # Filter out examples that are too long
-        raw_datasets = raw_datasets.filter(
+        dataset = dataset.filter(
             lambda x: len(x["input_ids_chosen"]) <= config.max_length
             and len(x["input_ids_rejected"]) <= config.max_length,
             num_proc=config.dataset_num_proc,
         )
 
-    train_dataset = raw_datasets[args.dataset_train_split]
-    eval_dataset = raw_datasets[args.dataset_test_split]
-
     ##########
     # Training
     ##########
@@ -163,8 +160,8 @@ def preprocess_function(examples):
         model=model,
         tokenizer=tokenizer,
         args=config,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         peft_config=get_peft_config(model_config),
     )
     trainer.train()
diff --git a/examples/scripts/rloo/rloo_tldr.py b/examples/scripts/rloo/rloo_tldr.py
index e40bec3ed2..ae96d1f0c1 100644
--- a/examples/scripts/rloo/rloo_tldr.py
+++ b/examples/scripts/rloo/rloo_tldr.py
@@ -89,9 +89,9 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
-    train_dataset = raw_datasets["train"]
-    eval_dataset = raw_datasets["validation"]
+    dataset = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["validation"]
 
     def prepare_dataset(dataset, tokenizer):
         """pre-tokenize the dataset before training; only collate during training"""
diff --git a/examples/scripts/sft.py b/examples/scripts/sft.py
index 1e461bc81b..6b00e30227 100644
--- a/examples/scripts/sft.py
+++ b/examples/scripts/sft.py
@@ -90,10 +90,7 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset(args.dataset_name)
-
-    train_dataset = raw_datasets[args.dataset_train_split]
-    eval_dataset = raw_datasets[args.dataset_test_split]
+    dataset = load_dataset(args.dataset_name)
 
     ################
     # Training
@@ -101,8 +98,8 @@
     trainer = SFTTrainer(
         model=model_config.model_name_or_path,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/vsft_llava.py b/examples/scripts/vsft_llava.py
index 7313df2e22..d3ffb808b3 100644
--- a/examples/scripts/vsft_llava.py
+++ b/examples/scripts/vsft_llava.py
@@ -103,9 +103,7 @@ def collate_fn(examples):
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset(sft_script_args.dataset_name)
-    train_dataset = raw_datasets[sft_script_args.dataset_train_split]
-    eval_dataset = raw_datasets[sft_script_args.dataset_test_split]
+    dataset = load_dataset(sft_script_args.dataset_name)
 
     ################
     # Training
@@ -114,8 +112,8 @@ def collate_fn(examples):
         model=model,
         args=training_args,
         data_collator=collate_fn,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[sft_script_args.dataset_train_split],
+        eval_dataset=dataset[sft_script_args.dataset_test_split],
         tokenizer=processor.tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py
index 94e2bf230e..173369a020 100644
--- a/tests/test_trainers_args.py
+++ b/tests/test_trainers_args.py
@@ -80,7 +80,7 @@ def test_bco(self):
 
     def test_cpo(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        dataset = dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
+        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
         with tempfile.TemporaryDirectory() as tmp_dir:
             args = CPOConfig(
                 tmp_dir,
@@ -121,7 +121,7 @@ def test_cpo(self):
 
     def test_dpo(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        dataset = dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
+        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
         with tempfile.TemporaryDirectory() as tmp_dir:
             args = DPOConfig(
                 tmp_dir,
@@ -182,7 +182,7 @@ def test_dpo(self):
 
     def test_kto(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        dataset = dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
+        dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
         with tempfile.TemporaryDirectory() as tmp_dir:
             args = KTOConfig(
                 tmp_dir,