diff --git a/docs/source/detoxifying_a_lm.mdx b/docs/source/detoxifying_a_lm.mdx
index b07a166ba8..3fe2c80ae9 100644
--- a/docs/source/detoxifying_a_lm.mdx
+++ b/docs/source/detoxifying_a_lm.mdx
@@ -58,13 +58,13 @@ And its `continuation` value:
 We want to increase the chance for the model to generate toxic prompts so we get more learning signal. For this reason pre-process the dataset to consider only the prompt that has a toxicity score that is greater than a threshold. We can do this in a few lines of code:
 
 ```python
-ds = load_dataset("allenai/real-toxicity-prompts", split="train")
+train_dataset = load_dataset("allenai/real-toxicity-prompts", split="train")
 
 def filter_fn(sample):
     toxicity = sample["prompt"]["toxicity"]
     return toxicity is not None and toxicity > 0.3
 
-ds = ds.filter(filter_fn, batched=False)
+train_dataset = train_dataset.filter(filter_fn, batched=False)
 ```
 
 ### Reward function
diff --git a/examples/datasets/tokenize_ds.py b/examples/datasets/tokenize_ds.py
index 373c33e9a3..755c04fceb 100644
--- a/examples/datasets/tokenize_ds.py
+++ b/examples/datasets/tokenize_ds.py
@@ -29,7 +29,7 @@
 
 @dataclass
 class ScriptArguments:
-    dataset: str = field(
+    dataset_name: str = field(
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style", metadata={"help": "The dataset to load"}
     )
     model: str = field(default="gpt2", metadata={"help": "The model to use for tokenization"})
@@ -40,7 +40,7 @@ class ScriptArguments:
 
 if __name__ == "__main__":
     args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0]
-    ds = load_dataset(args.dataset)
+    dataset = load_dataset(args.dataset_name)
     tokenizer = AutoTokenizer.from_pretrained(args.model)
     if tokenizer.chat_template is None:
         tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
@@ -50,5 +50,5 @@ def process(row):
         row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
         return row
 
-    ds = ds.map(process, num_proc=args.dataset_num_proc)
-    print(ds["train"][0]["chosen"])
+    dataset = dataset.map(process, num_proc=args.dataset_num_proc)
+    print(dataset["train"][0]["chosen"])
diff --git a/examples/scripts/bco.py b/examples/scripts/bco.py
index db3c325ea0..23d171a8a0 100644
--- a/examples/scripts/bco.py
+++ b/examples/scripts/bco.py
@@ -114,38 +114,38 @@ def get_model_response(example, llm_name: str):
 
     dataset = load_dataset("openbmb/UltraFeedback")["train"]
 
-    ds = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc)
-    ds = ds.filter(
+    dataset = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc)
+    dataset = dataset.filter(
         lambda example: len(example["models"]) == len(example["completions"]), batched=False, num_proc=num_proc
     )
 
     METRIC = "helpfulness"
 
-    ds = ds.map(
+    dataset = dataset.map(
         get_model_rating,
         batched=False,
         fn_kwargs={"metric": METRIC, "llm_name": llm_name},
         num_proc=num_proc,
     )
 
-    ds = ds.map(
+    dataset = dataset.map(
         get_model_response,
         batched=False,
         fn_kwargs={"llm_name": llm_name},
         num_proc=num_proc,
     )
 
-    ds = ds.select_columns(["source", "instruction", "response", "helpfulness"])
+    dataset = dataset.select_columns(["source", "instruction", "response", "helpfulness"])
 
-    ds = ds.rename_columns({"instruction": "prompt", "response": "completion"})
-    ds = ds.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc)
+    dataset = dataset.rename_columns({"instruction": "prompt", "response": "completion"})
+    dataset = dataset.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc)
 
-    ds = ds.map(
+    dataset = dataset.map(
         lambda example: {"prompt": [{"role": "user", "content": example["prompt"]}]},
         batched=False,
         num_proc=num_proc,
     )
-    dataset = ds.train_test_split(test_size=0.05, seed=42)
+    dataset = dataset.train_test_split(test_size=0.05, seed=42)
 
     return dataset
@@ -209,7 +209,7 @@ def format_dataset(example):
     with PartialState().local_main_process_first():
         # Load the dataset
         dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=bco_args.dataset_num_proc)
-        formatted_dataset = dataset.map(format_dataset, batched=False, num_proc=bco_args.dataset_num_proc)
+        dataset = dataset.map(format_dataset, batched=False, num_proc=bco_args.dataset_num_proc)
 
     accelerator = Accelerator()
     embedding_model = AutoModel.from_pretrained(
@@ -233,8 +233,8 @@ def format_dataset(example):
         model,
         ref_model,
         args=bco_args,
-        train_dataset=formatted_dataset["train"],
-        eval_dataset=formatted_dataset["test"],
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_args),
         embedding_func=embedding_func,
diff --git a/examples/scripts/cpo.py b/examples/scripts/cpo.py
index b4c6386850..924cbf162a 100644
--- a/examples/scripts/cpo.py
+++ b/examples/scripts/cpo.py
@@ -64,7 +64,7 @@
 
 @dataclass
 class ScriptArguments:
-    dataset: str = field(
+    dataset_name: str = field(
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style",
         metadata={"help": "The name of the dataset to use."},
     )
@@ -89,7 +89,7 @@ class ScriptArguments:
     ################
     # Dataset
    ################
-    ds = load_dataset(args.dataset)
+    dataset = load_dataset(args.dataset_name)
 
     if tokenizer.chat_template is None:
         tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
@@ -101,10 +101,7 @@ def process(row):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_proc=cpo_args.dataset_num_proc)
-
-    train_dataset = ds["train"]
-    eval_dataset = ds["test"]
+        dataset = dataset.map(process, num_proc=cpo_args.dataset_num_proc)
 
     ################
     # Training
@@ -112,8 +109,8 @@ def process(row):
     trainer = CPOTrainer(
         model,
         args=cpo_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index 94abfd3efc..99f3bd16aa 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -112,17 +112,14 @@
     ################
     # Dataset
     ################
-    ds = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     with PartialState().local_main_process_first():
-        ds = ds.map(maybe_extract_prompt, num_proc=training_args.dataset_num_proc)
-        ds = ds.map(
+        dataset = dataset.map(maybe_extract_prompt, num_proc=training_args.dataset_num_proc)
+        dataset = dataset.map(
             maybe_apply_chat_template, num_proc=training_args.dataset_num_proc, fn_kwargs={"tokenizer": tokenizer}
         )
 
-    train_dataset = ds[args.dataset_train_split]
-    eval_dataset = ds[args.dataset_test_split]
-
     ################
     # Training
     ################
@@ -130,8 +127,8 @@
         model,
         ref_model,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=tokenizer,
         peft_config=peft_config,
     )
diff --git a/examples/scripts/dpo_visual.py b/examples/scripts/dpo_visual.py
index 277d786c5c..3f0fe8665d 100644
--- a/examples/scripts/dpo_visual.py
+++ b/examples/scripts/dpo_visual.py
@@ -105,7 +105,7 @@
     ################
     # Dataset
     ################
-    ds = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     def process(row):
         row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)
@@ -116,10 +116,7 @@ def process(row):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_proc=training_args.dataset_num_proc)
-
-    train_dataset = ds[args.dataset_train_split]
-    eval_dataset = ds[args.dataset_test_split]
+        dataset = dataset.map(process, num_proc=training_args.dataset_num_proc)
 
     ################
     # Training
@@ -128,8 +125,8 @@ def process(row):
         model,
         ref_model,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=processor,
         peft_config=peft_config,
     )
diff --git a/examples/scripts/evals/judge_tldr.py b/examples/scripts/evals/judge_tldr.py
index ac4d7cfb9a..9e6111cf7c 100644
--- a/examples/scripts/evals/judge_tldr.py
+++ b/examples/scripts/evals/judge_tldr.py
@@ -62,13 +62,13 @@ class ScriptArguments:
 args = parser.parse_args_into_dataclasses()[0]
 
 # Load the dataset
-raw_dataset = load_dataset("trl-lib/tldr", split="validation")
+dataset = load_dataset("trl-lib/tldr", split="validation")
 if args.num_examples is not None:
-    raw_dataset = raw_dataset.select(range(args.num_examples))
+    dataset = dataset.select(range(args.num_examples))
 
 # Extract the prompts and reference completions
-prompts = raw_dataset["prompt"]
-reference_completions = raw_dataset["completion"]
+prompts = dataset["prompt"]
+reference_completions = dataset["completion"]
 
 # Generate the model completions
 sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=200)  # very generous max token length
diff --git a/examples/scripts/gkd.py b/examples/scripts/gkd.py
index 05791ba36e..c268ac0c63 100644
--- a/examples/scripts/gkd.py
+++ b/examples/scripts/gkd.py
@@ -103,19 +103,16 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     with PartialState().local_main_process_first():
-        raw_datasets = raw_datasets.map(
+        dataset = dataset.map(
             lambda x: {
                 "prompt": tokenizer.apply_chat_template(x["prompt"], tokenize=False, add_generation_prompt=True)
             },
             num_proc=training_args.dataset_num_proc,
         )
 
-    train_dataset = raw_datasets[args.dataset_train_split]
-    eval_dataset = raw_datasets[args.dataset_test_split]
-
     ################
     # Training
     ################
@@ -123,8 +120,8 @@
         model=model_config.model_name_or_path,
         teacher_model=training_args.teacher_model_name_or_path,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/kto.py b/examples/scripts/kto.py
index df5dfb105d..09d30d62f6 100644
--- a/examples/scripts/kto.py
+++ b/examples/scripts/kto.py
@@ -113,15 +113,15 @@ def format_dataset(example):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        formatted_dataset = dataset.map(format_dataset, num_proc=kto_args.dataset_num_proc)
+        dataset = dataset.map(format_dataset, num_proc=kto_args.dataset_num_proc)
 
     # Initialize the KTO trainer
     kto_trainer = KTOTrainer(
         model,
         ref_model,
         args=kto_args,
-        train_dataset=formatted_dataset["train"],
-        eval_dataset=formatted_dataset["test"],
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_args),
     )
diff --git a/examples/scripts/orpo.py b/examples/scripts/orpo.py
index b5ce93dfd8..9aa0684c6f 100644
--- a/examples/scripts/orpo.py
+++ b/examples/scripts/orpo.py
@@ -64,7 +64,7 @@
 
 @dataclass
 class ScriptArguments:
-    dataset: str = field(
+    dataset_name: str = field(
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style",
         metadata={"help": "The name of the dataset to use."},
     )
@@ -89,7 +89,7 @@ class ScriptArguments:
     ################
     # Dataset
     ################
-    ds = load_dataset(args.dataset)
+    dataset = load_dataset(args.dataset_name)
 
     if tokenizer.chat_template is None:
         tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
@@ -102,10 +102,7 @@ def process(row):
     # Compute that only on the main process for faster data processing.
     # see: https://github.com/huggingface/trl/pull/1255
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_prc=orpo_args.dataset_num_proc)
-
-    train_dataset = ds["train"]
-    eval_dataset = ds["test"]
+        dataset = dataset.map(process, num_proc=orpo_args.dataset_num_proc)
 
     ################
     # Training
@@ -113,8 +110,8 @@ def process(row):
     trainer = ORPOTrainer(
         model,
         args=orpo_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/ppo/ppo.py b/examples/scripts/ppo/ppo.py
index 408ce29778..5312f32cc7 100644
--- a/examples/scripts/ppo/ppo.py
+++ b/examples/scripts/ppo/ppo.py
@@ -87,10 +87,10 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset("trl-internal-testing/descriptiveness-sentiment-trl-style", split="descriptiveness")
+    dataset = load_dataset("trl-internal-testing/descriptiveness-sentiment-trl-style", split="descriptiveness")
     eval_samples = 20
-    train_dataset = raw_datasets.select(range(len(raw_datasets) - eval_samples))
-    eval_dataset = raw_datasets.select(range(len(raw_datasets) - eval_samples, len(raw_datasets)))
+    train_dataset = dataset.select(range(len(dataset) - eval_samples))
+    eval_dataset = dataset.select(range(len(dataset) - eval_samples, len(dataset)))
     dataset_text_field = "prompt"
 
     def prepare_dataset(dataset, tokenizer):
diff --git a/examples/scripts/ppo/ppo_tldr.py b/examples/scripts/ppo/ppo_tldr.py
index 4fb8b23d2e..138e0f2570 100644
--- a/examples/scripts/ppo/ppo_tldr.py
+++ b/examples/scripts/ppo/ppo_tldr.py
@@ -89,9 +89,9 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
-    train_dataset = raw_datasets["train"]
-    eval_dataset = raw_datasets["validation"]
+    dataset = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["validation"]
 
     def prepare_dataset(dataset, tokenizer):
         """pre-tokenize the dataset before training; only collate during training"""
diff --git a/examples/scripts/reward_modeling.py b/examples/scripts/reward_modeling.py
index 3daf375352..e00e5d8ff8 100644
--- a/examples/scripts/reward_modeling.py
+++ b/examples/scripts/reward_modeling.py
@@ -113,7 +113,7 @@
     #############################
     # Load and preprocess dataset
     #############################
-    raw_datasets = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name)
 
     def preprocess_function(examples):
         new_examples = {
@@ -137,25 +137,22 @@ def preprocess_function(examples):
         # This assumes the chosen/rejected columns are in the OpenAI messages format.
         chosen_fn = conversations_formatting_function(tokenizer, "chosen")
         rejected_fn = conversations_formatting_function(tokenizer, "rejected")
-        raw_datasets = raw_datasets.map(
+        dataset = dataset.map(
             lambda x: {"chosen": chosen_fn(x), "rejected": rejected_fn(x)}, num_proc=config.dataset_num_proc
         )
         # Tokenize inputs
-        raw_datasets = raw_datasets.map(
+        dataset = dataset.map(
             preprocess_function,
             batched=True,
             num_proc=config.dataset_num_proc,
         )
         # Filter out examples that are too long
-        raw_datasets = raw_datasets.filter(
+        dataset = dataset.filter(
             lambda x: len(x["input_ids_chosen"]) <= config.max_length
             and len(x["input_ids_rejected"]) <= config.max_length,
             num_proc=config.dataset_num_proc,
         )
 
-    train_dataset = raw_datasets[args.dataset_train_split]
-    eval_dataset = raw_datasets[args.dataset_test_split]
-
     ##########
     # Training
     ##########
@@ -163,8 +160,8 @@ def preprocess_function(examples):
         model=model,
         tokenizer=tokenizer,
         args=config,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         peft_config=get_peft_config(model_config),
     )
     trainer.train()
diff --git a/examples/scripts/rloo/rloo_tldr.py b/examples/scripts/rloo/rloo_tldr.py
index e40bec3ed2..ae96d1f0c1 100644
--- a/examples/scripts/rloo/rloo_tldr.py
+++ b/examples/scripts/rloo/rloo_tldr.py
@@ -89,9 +89,9 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
-    train_dataset = raw_datasets["train"]
-    eval_dataset = raw_datasets["validation"]
+    dataset = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style")
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["validation"]
 
     def prepare_dataset(dataset, tokenizer):
         """pre-tokenize the dataset before training; only collate during training"""
diff --git a/examples/scripts/sft.py b/examples/scripts/sft.py
index 1e461bc81b..6b00e30227 100644
--- a/examples/scripts/sft.py
+++ b/examples/scripts/sft.py
@@ -90,10 +90,7 @@
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset(args.dataset_name)
-
-    train_dataset = raw_datasets[args.dataset_train_split]
-    eval_dataset = raw_datasets[args.dataset_test_split]
+    dataset = load_dataset(args.dataset_name)
 
     ################
     # Training
@@ -101,8 +98,8 @@
     trainer = SFTTrainer(
         model=model_config.model_name_or_path,
         args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
         tokenizer=tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/examples/scripts/vsft_llava.py b/examples/scripts/vsft_llava.py
index 7313df2e22..d3ffb808b3 100644
--- a/examples/scripts/vsft_llava.py
+++ b/examples/scripts/vsft_llava.py
@@ -103,9 +103,7 @@ def collate_fn(examples):
     ################
     # Dataset
     ################
-    raw_datasets = load_dataset(sft_script_args.dataset_name)
-    train_dataset = raw_datasets[sft_script_args.dataset_train_split]
-    eval_dataset = raw_datasets[sft_script_args.dataset_test_split]
+    dataset = load_dataset(sft_script_args.dataset_name)
 
     ################
     # Training
@@ -114,8 +112,8 @@ def collate_fn(examples):
         model=model,
         args=training_args,
         data_collator=collate_fn,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
+        train_dataset=dataset[sft_script_args.dataset_train_split],
+        eval_dataset=dataset[sft_script_args.dataset_test_split],
         tokenizer=processor.tokenizer,
         peft_config=get_peft_config(model_config),
     )
diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py
index 94e2bf230e..173369a020 100644
--- a/tests/test_trainers_args.py
+++ b/tests/test_trainers_args.py
@@ -80,7 +80,7 @@ def test_bco(self):
 
     def test_cpo(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        dataset = dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
+        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
         with tempfile.TemporaryDirectory() as tmp_dir:
             args = CPOConfig(
                 tmp_dir,
@@ -121,7 +121,7 @@ def test_cpo(self):
 
     def test_dpo(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        dataset = dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
+        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
         with tempfile.TemporaryDirectory() as tmp_dir:
             args = DPOConfig(
                 tmp_dir,
@@ -182,7 +182,7 @@ def test_dpo(self):
 
     def test_kto(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        dataset = dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
+        dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
         with tempfile.TemporaryDirectory() as tmp_dir:
             args = KTOConfig(
                 tmp_dir,