From 10c2f63b2ac8564cca28aa1598a1f3ac6a5fc63c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:03:47 +0200 Subject: [PATCH] `training_args` for all `TrainingArguments` (#2082) --- docs/source/cpo_trainer.mdx | 4 +-- docs/source/gkd_trainer.md | 4 +-- docs/source/nash_md_trainer.md | 6 ++-- docs/source/online_dpo_trainer.md | 6 ++-- docs/source/orpo_trainer.md | 4 +-- docs/source/reward_trainer.mdx | 2 +- docs/source/sft_trainer.mdx | 46 ++++++++++++++--------------- docs/source/xpo_trainer.mdx | 6 ++-- examples/scripts/alignprop.py | 6 ++-- examples/scripts/bco.py | 12 ++++---- examples/scripts/cpo.py | 8 ++--- examples/scripts/ddpo.py | 6 ++-- examples/scripts/kto.py | 10 +++---- examples/scripts/orpo.py | 8 ++--- examples/scripts/ppo.py | 12 ++++---- examples/scripts/ppo/ppo.py | 20 ++++++------- examples/scripts/ppo/ppo_tldr.py | 24 +++++++-------- examples/scripts/reward_modeling.py | 20 ++++++------- examples/scripts/rloo/rloo.py | 24 +++++++-------- examples/scripts/rloo/rloo_tldr.py | 22 +++++++------- tests/slow/test_sft_slow.py | 40 ++++++++++++------------- tests/test_alignprop_trainer.py | 8 ++--- tests/test_callbacks.py | 12 ++++---- tests/test_ddpo_trainer.py | 8 ++--- tests/test_dpo_trainer.py | 24 +++++++-------- tests/test_iterative_sft_trainer.py | 4 +-- tests/test_trainers_args.py | 34 ++++++++++++--------- 27 files changed, 192 insertions(+), 188 deletions(-) diff --git a/docs/source/cpo_trainer.mdx b/docs/source/cpo_trainer.mdx index 05c0f40cf9..39c80f3acc 100644 --- a/docs/source/cpo_trainer.mdx +++ b/docs/source/cpo_trainer.mdx @@ -61,13 +61,13 @@ The CPO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that For a detailed example have a look at the `examples/scripts/cpo.py` script. At a high level we need to initialize the `CPOTrainer` with a `model` we wish to train. **Note that CPOTrainer eliminates the need to use the reference model, simplifying the optimization process.** The `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. 
```py -cpo_config = CPOConfig( +training_args = CPOConfig( beta=0.1, ) cpo_trainer = CPOTrainer( model, - args=cpo_config, + args=training_args, train_dataset=train_dataset, tokenizer=tokenizer, ) diff --git a/docs/source/gkd_trainer.md b/docs/source/gkd_trainer.md index 4801c35c32..e6513cb5cc 100644 --- a/docs/source/gkd_trainer.md +++ b/docs/source/gkd_trainer.md @@ -67,11 +67,11 @@ eval_dataset = Dataset.from_dict( } ) -args = GKDConfig(output_dir="gkd-model", per_device_train_batch_size=1) +training_args = GKDConfig(output_dir="gkd-model", per_device_train_batch_size=1) trainer = GKDTrainer( model=model, teacher_model=teacher_model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=train_dataset, eval_dataset=eval_dataset, diff --git a/docs/source/nash_md_trainer.md b/docs/source/nash_md_trainer.md index e0d931d187..9d380cb4c0 100644 --- a/docs/source/nash_md_trainer.md +++ b/docs/source/nash_md_trainer.md @@ -34,11 +34,11 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") reward_model = AutoModelForSequenceClassification.from_pretrained("trl-lib/Qwen2-0.5B-Reward", num_labels=1) train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train") -args = NashMDConfig(output_dir="nash-md-qwen2", logging_steps=10) +training_args = NashMDConfig(output_dir="nash-md-qwen2", logging_steps=10) trainer = NashMDTrainer( model=model, reward_model=reward_model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=train_dataset, ) @@ -66,7 +66,7 @@ Make sure that the SFT model and reward model use the _same_ chat template. Othe We can want the model to generate completion within a given length. During the learning, the model will generate completion up to the maximum completion length specified in the `max_new_tokens` argument of [`NashMDConfig`]. I you want to penalize for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`NashMDConfig`]: ```python -args = NashMDConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) +training_args = NashMDConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) ``` ### Logging Completions diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md index 3dfca04053..1e272ae26f 100644 --- a/docs/source/online_dpo_trainer.md +++ b/docs/source/online_dpo_trainer.md @@ -36,11 +36,11 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") reward_model = AutoModelForSequenceClassification.from_pretrained("trl-lib/Qwen2-0.5B-Reward", num_labels=1) train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train") -args = OnlineDPOConfig(output_dir="online-dpo-qwen2", logging_steps=10) +training_args = OnlineDPOConfig(output_dir="online-dpo-qwen2", logging_steps=10) trainer = OnlineDPOTrainer( model=model, reward_model=reward_model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=train_dataset, ) @@ -85,7 +85,7 @@ Make sure that the SFT model and reward model use the _same_ chat template. Othe We can want the model to generate completion within a given length. During the learning, the model will generate completion up to the maximum completion length specified in the `max_new_tokens` argument of [`OnlineDPOConfig`]. 
I you want to penalize for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`OnlineDPOConfig`]: ```python -args = OnlineDPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) +training_args = OnlineDPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) ``` ### Logging Completions diff --git a/docs/source/orpo_trainer.md b/docs/source/orpo_trainer.md index 124bc7891b..12358f940b 100644 --- a/docs/source/orpo_trainer.md +++ b/docs/source/orpo_trainer.md @@ -56,13 +56,13 @@ The ORPO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that For a detailed example have a look at the `examples/scripts/orpo.py` script. At a high level we need to initialize the `ORPOTrainer` with a `model` we wish to train. **Note that ORPOTrainer eliminates the need to use the reference model, simplifying the optimization process.** The `beta` refers to the hyperparameter `lambda` in eq. (6) of the paper and refers to the weighting of the relative odd ratio loss in the standard cross-entropy loss used for SFT. ```py -orpo_config = ORPOConfig( +training_args = ORPOConfig( beta=0.1, # the lambda/alpha hyperparameter in the paper/code ) orpo_trainer = ORPOTrainer( model, - args=orpo_config, + args=training_args, train_dataset=train_dataset, tokenizer=tokenizer, ) diff --git a/docs/source/reward_trainer.mdx b/docs/source/reward_trainer.mdx index 8335ecf56a..5a73217ead 100644 --- a/docs/source/reward_trainer.mdx +++ b/docs/source/reward_trainer.mdx @@ -79,7 +79,7 @@ $$\Big( R(p, r_1) + R(p, r_2) \Big)^2 $$ This auxiliary loss is combined with the main loss function, weighted by the parameter `center_rewards_coefficient` in the `[RewardConfig]`. By default, this feature is deactivated (`center_rewards_coefficient = None`). ```python -reward_config = RewardConfig( +training_args = RewardConfig( center_rewards_coefficient=0.01, ... ) diff --git a/docs/source/sft_trainer.mdx b/docs/source/sft_trainer.mdx index fb14c68dc3..99f10006c4 100644 --- a/docs/source/sft_trainer.mdx +++ b/docs/source/sft_trainer.mdx @@ -16,7 +16,7 @@ from trl import SFTConfig, SFTTrainer dataset = load_dataset("stanfordnlp/imdb", split="train") -sft_config = SFTConfig( +training_args = SFTConfig( dataset_text_field="text", max_seq_length=512, output_dir="/tmp", @@ -24,7 +24,7 @@ sft_config = SFTConfig( trainer = SFTTrainer( "facebook/opt-350m", train_dataset=dataset, - args=sft_config, + args=training_args, ) trainer.train() ``` @@ -41,12 +41,12 @@ dataset = load_dataset("stanfordnlp/imdb", split="train") model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") -sft_config = SFTConfig(output_dir="/tmp") +training_args = SFTConfig(output_dir="/tmp") trainer = SFTTrainer( model, train_dataset=dataset, - args=sft_config, + args=training_args, ) trainer.train() @@ -220,10 +220,10 @@ dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train") ... -sft_config = SFTConfig(packing=True) +training_args = SFTConfig(packing=True) trainer = SFTTrainer( "facebook/opt-350m", - args=sft_config, + args=training_args, train_dataset=dataset, ) ``` @@ -256,7 +256,7 @@ def formatting_prompts_func(example): trainer = SFTTrainer( model, - args=sft_config, + args=training_args, train_dataset=dataset, formatting_func=formatting_prompts_func, ) @@ -271,12 +271,12 @@ To properly format your input make sure to process all the examples by looping o ```python ... 
-sft_config = SFTConfig(packing=True, dataset_text_field="text",) +training_args = SFTConfig(packing=True, dataset_text_field="text",) trainer = SFTTrainer( "facebook/opt-350m", train_dataset=dataset, - args=sft_config + args=training_args ) trainer.train() @@ -294,11 +294,11 @@ def formatting_func(example): text = f"### Question: {example['question']}\n ### Answer: {example['answer']}" return text -sft_config = SFTConfig(packing=True) +training_args = SFTConfig(packing=True) trainer = SFTTrainer( "facebook/opt-350m", train_dataset=dataset, - args=sft_config, + args=training_args, formatting_func=formatting_func ) @@ -315,7 +315,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=to ... -sft_config = SFTConfig( +training_args = SFTConfig( model_init_kwargs={ "torch_dtype": "bfloat16", }, @@ -324,7 +324,7 @@ sft_config = SFTConfig( trainer = SFTTrainer( "facebook/opt-350m", train_dataset=dataset, - args=sft_config, + args=training_args, ) trainer.train() @@ -510,13 +510,13 @@ from trl import SFTConfig, SFTTrainer dataset = load_dataset("stanfordnlp/imdb", split="train") -sft_config = SFTConfig( +training_args = SFTConfig( neftune_noise_alpha=5, ) trainer = SFTTrainer( "facebook/opt-350m", train_dataset=dataset, - args=sft_config, + args=training_args, ) trainer.train() ``` @@ -578,7 +578,7 @@ model = FastLanguageModel.get_peft_model( random_state=3407, ) -args = SFTConfig( +training_args = SFTConfig( output_dir="./output", max_seq_length=max_seq_length, dataset_text_field="text", @@ -586,7 +586,7 @@ args = SFTConfig( trainer = SFTTrainer( model=model, - args=args, + args=training_args, train_dataset=dataset, ) trainer.train() @@ -611,10 +611,10 @@ With great memory reduction, you can potentially turn off cpu_offloading or grad pip install liger-kernel ``` -2. Once installed, set `use_liger` in [SFTConfig](https://github.com/huggingface/trl/blob/850ddcf598984013007d384c6b3e311def2a616e/trl/trainer/sft_config.py#L69). No other changes are needed! +2. Once installed, set `use_liger` in [`SFTConfig`]. No other changes are needed! ```python -config = SFTConfig( +training_args = SFTConfig( use_liger=True ) ``` @@ -742,13 +742,13 @@ print(collated_data.keys()) # dict_keys(['input_ids', 'attention_mask', 'pixel_ Now that we have prepared the data and defined the collator, we can proceed with training the model. To ensure that the data is not processed as text-only, we need to set a couple of arguments in the `SFTConfig`, specifically `dataset_text_field` and `remove_unused_columns`. We also need to set `skip_prepare_dataset` to `True` to avoid the default processing of the dataset. Below is an example of how to set up the `SFTTrainer`. 
```python -args.dataset_text_field = "" # needs a dummy field -args.remove_unused_columns = False -args.dataset_kwargs = {"skip_prepare_dataset": True} +training_args.dataset_text_field = "" # needs a dummy field +training_args.remove_unused_columns = False +training_args.dataset_kwargs = {"skip_prepare_dataset": True} trainer = SFTTrainer( model=model, - args=args, + args=training_args, data_collator=collate_fn, train_dataset=train_dataset, tokenizer=processor.tokenizer, diff --git a/docs/source/xpo_trainer.mdx b/docs/source/xpo_trainer.mdx index ef23da1db8..57fa58c21c 100644 --- a/docs/source/xpo_trainer.mdx +++ b/docs/source/xpo_trainer.mdx @@ -34,11 +34,11 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") reward_model = AutoModelForSequenceClassification.from_pretrained("trl-lib/Qwen2-0.5B-Reward", num_labels=1) train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train") -args = XPOConfig(output_dir="nash-md-qwen2", logging_steps=10) +training_args = XPOConfig(output_dir="nash-md-qwen2", logging_steps=10) trainer = XPOTrainer( model=model, reward_model=reward_model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=train_dataset, ) @@ -66,7 +66,7 @@ Make sure that the SFT model and reward model use the _same_ chat template. Othe We can want the model to generate completion within a given length. During the learning, the model will generate completion up to the maximum completion length specified in the `max_new_tokens` argument of [`XPOConfig`]. I you want to penalize for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`XPOConfig`]: ```python -args = XPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) +training_args = XPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) ``` ### Logging Completions diff --git a/examples/scripts/alignprop.py b/examples/scripts/alignprop.py index f44c677634..376f5bee2b 100644 --- a/examples/scripts/alignprop.py +++ b/examples/scripts/alignprop.py @@ -106,8 +106,8 @@ def image_outputs_logger(image_pair_data, global_step, accelerate_logger): if __name__ == "__main__": parser = HfArgumentParser((ScriptArguments, AlignPropConfig)) - args, alignprop_config = parser.parse_args_into_dataclasses() - alignprop_config.project_kwargs = { + args, training_args = parser.parse_args_into_dataclasses() + training_args.project_kwargs = { "logging_dir": "./logs", "automatic_checkpoint_naming": True, "total_limit": 5, @@ -118,7 +118,7 @@ def image_outputs_logger(image_pair_data, global_step, accelerate_logger): args.pretrained_model, pretrained_model_revision=args.pretrained_revision, use_lora=args.use_lora ) trainer = AlignPropTrainer( - alignprop_config, + training_args, aesthetic_scorer(args.hf_hub_aesthetic_model_id, args.hf_hub_aesthetic_model_filename), prompt_fn, pipeline, diff --git a/examples/scripts/bco.py b/examples/scripts/bco.py index 23d171a8a0..5ac9ed4a0d 100644 --- a/examples/scripts/bco.py +++ b/examples/scripts/bco.py @@ -175,9 +175,9 @@ def mean_pooling(model_output, attention_mask): if __name__ == "__main__": parser = HfArgumentParser((ScriptArguments, BCOConfig, ModelConfig)) - script_args, bco_args, model_args = parser.parse_args_into_dataclasses() + script_args, training_args, model_args = parser.parse_args_into_dataclasses() - bco_args.gradient_checkpointing_kwargs = {"use_reentrant": True} + training_args.gradient_checkpointing_kwargs = {"use_reentrant": True} # Load a pretrained model model = 
AutoModelForCausalLM.from_pretrained( @@ -208,8 +208,8 @@ def format_dataset(example): # see: https://github.com/huggingface/trl/pull/1255 with PartialState().local_main_process_first(): # Load the dataset - dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=bco_args.dataset_num_proc) - dataset = dataset.map(format_dataset, batched=False, num_proc=bco_args.dataset_num_proc) + dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=training_args.dataset_num_proc) + dataset = dataset.map(format_dataset, batched=False, num_proc=training_args.dataset_num_proc) accelerator = Accelerator() embedding_model = AutoModel.from_pretrained( @@ -232,7 +232,7 @@ def format_dataset(example): bco_trainer = BCOTrainer( model, ref_model, - args=bco_args, + args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], tokenizer=tokenizer, @@ -243,4 +243,4 @@ def format_dataset(example): # Train and push the model to the Hub bco_trainer.train() - bco_trainer.save_model(bco_args.output_dir) + bco_trainer.save_model(training_args.output_dir) diff --git a/examples/scripts/cpo.py b/examples/scripts/cpo.py index 924cbf162a..66c295bfb7 100644 --- a/examples/scripts/cpo.py +++ b/examples/scripts/cpo.py @@ -72,7 +72,7 @@ class ScriptArguments: if __name__ == "__main__": parser = HfArgumentParser((ScriptArguments, CPOConfig, ModelConfig)) - args, cpo_args, model_config = parser.parse_args_into_dataclasses() + args, training_args, model_config = parser.parse_args_into_dataclasses() ################ # Model & Tokenizer @@ -101,14 +101,14 @@ def process(row): # Compute that only on the main process for faster data processing. # see: https://github.com/huggingface/trl/pull/1255 with PartialState().local_main_process_first(): - dataset = dataset.map(process, num_proc=cpo_args.dataset_num_proc) + dataset = dataset.map(process, num_proc=training_args.dataset_num_proc) ################ # Training ################ trainer = CPOTrainer( model, - args=cpo_args, + args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], tokenizer=tokenizer, @@ -117,4 +117,4 @@ def process(row): # train and save the model trainer.train() - trainer.save_model(cpo_args.output_dir) + trainer.save_model(training_args.output_dir) diff --git a/examples/scripts/ddpo.py b/examples/scripts/ddpo.py index 2b318d0b82..c5baae3c12 100644 --- a/examples/scripts/ddpo.py +++ b/examples/scripts/ddpo.py @@ -185,8 +185,8 @@ def image_outputs_logger(image_data, global_step, accelerate_logger): if __name__ == "__main__": parser = HfArgumentParser((ScriptArguments, DDPOConfig)) - args, ddpo_config = parser.parse_args_into_dataclasses() - ddpo_config.project_kwargs = { + args, training_args = parser.parse_args_into_dataclasses() + training_args.project_kwargs = { "logging_dir": "./logs", "automatic_checkpoint_naming": True, "total_limit": 5, @@ -198,7 +198,7 @@ def image_outputs_logger(image_data, global_step, accelerate_logger): ) trainer = DDPOTrainer( - ddpo_config, + training_args, aesthetic_scorer(args.hf_hub_aesthetic_model_id, args.hf_hub_aesthetic_model_filename), prompt_fn, pipeline, diff --git a/examples/scripts/kto.py b/examples/scripts/kto.py index 09d30d62f6..b9ae7bf987 100644 --- a/examples/scripts/kto.py +++ b/examples/scripts/kto.py @@ -74,7 +74,7 @@ class ScriptArguments: if __name__ == "__main__": parser = HfArgumentParser((ScriptArguments, KTOConfig, ModelConfig)) - script_args, kto_args, model_args = parser.parse_args_into_dataclasses() + script_args, training_args, model_args = 
parser.parse_args_into_dataclasses() # Load a pretrained model model = AutoModelForCausalLM.from_pretrained( @@ -98,7 +98,7 @@ class ScriptArguments: dataset = load_dataset(script_args.dataset_name) # If needed, reformat a DPO-formatted dataset (prompt, chosen, rejected) to a KTO-format (prompt, completion, label) - dataset = maybe_unpair_preference_dataset(dataset, num_proc=kto_args.dataset_num_proc) + dataset = maybe_unpair_preference_dataset(dataset, num_proc=training_args.dataset_num_proc) # Apply chat template def format_dataset(example): @@ -113,13 +113,13 @@ def format_dataset(example): # Compute that only on the main process for faster data processing. # see: https://github.com/huggingface/trl/pull/1255 with PartialState().local_main_process_first(): - dataset = dataset.map(format_dataset, num_proc=kto_args.dataset_num_proc) + dataset = dataset.map(format_dataset, num_proc=training_args.dataset_num_proc) # Initialize the KTO trainer kto_trainer = KTOTrainer( model, ref_model, - args=kto_args, + args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], tokenizer=tokenizer, @@ -128,5 +128,5 @@ def format_dataset(example): # Train and push the model to the Hub kto_trainer.train() - kto_trainer.save_model(kto_args.output_dir) + kto_trainer.save_model(training_args.output_dir) kto_trainer.push_to_hub() diff --git a/examples/scripts/orpo.py b/examples/scripts/orpo.py index 9aa0684c6f..98c66fb578 100644 --- a/examples/scripts/orpo.py +++ b/examples/scripts/orpo.py @@ -72,7 +72,7 @@ class ScriptArguments: if __name__ == "__main__": parser = HfArgumentParser((ScriptArguments, ORPOConfig, ModelConfig)) - args, orpo_args, model_config = parser.parse_args_into_dataclasses() + args, training_args, model_config = parser.parse_args_into_dataclasses() ################ # Model & Tokenizer @@ -102,14 +102,14 @@ def process(row): # Compute that only on the main process for faster data processing. # see: https://github.com/huggingface/trl/pull/1255 with PartialState().local_main_process_first(): - dataset = dataset.map(process, num_prc=orpo_args.dataset_num_proc) + dataset = dataset.map(process, num_prc=training_args.dataset_num_proc) ################ # Training ################ trainer = ORPOTrainer( model, - args=orpo_args, + args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], tokenizer=tokenizer, @@ -118,4 +118,4 @@ def process(row): # train and save the model trainer.train() - trainer.save_model(orpo_args.output_dir) + trainer.save_model(training_args.output_dir) diff --git a/examples/scripts/ppo.py b/examples/scripts/ppo.py index 65f5f4e93e..a8c3140d34 100644 --- a/examples/scripts/ppo.py +++ b/examples/scripts/ppo.py @@ -74,9 +74,9 @@ def build_dataset(query_dataset, dataset_num_proc, input_min_text_length=2, inpu The dataloader for the dataset. 
""" # load imdb with datasets - ds = load_dataset(query_dataset, split="train") - ds = ds.rename_columns({"text": "review"}) - ds = ds.filter(lambda x: len(x["review"]) > 200, num_proc=dataset_num_proc) + dataset = load_dataset(query_dataset, split="train") + dataset = dataset.rename_columns({"text": "review"}) + dataset = dataset.filter(lambda x: len(x["review"]) > 200, num_proc=dataset_num_proc) input_size = LengthSampler(input_min_text_length, input_max_text_length) @@ -85,9 +85,9 @@ def tokenize(sample): sample["query"] = tokenizer.decode(sample["input_ids"]) return sample - ds = ds.map(tokenize, num_proc=dataset_num_proc) - ds.set_format(type="torch") - return ds + dataset = dataset.map(tokenize, num_proc=dataset_num_proc) + dataset.set_format(type="torch") + return dataset # We retrieve the dataloader by calling the `build_dataset` function. diff --git a/examples/scripts/ppo/ppo.py b/examples/scripts/ppo/ppo.py index 5312f32cc7..541af12b6c 100644 --- a/examples/scripts/ppo/ppo.py +++ b/examples/scripts/ppo/ppo.py @@ -57,9 +57,9 @@ if __name__ == "__main__": parser = HfArgumentParser((PPOv2Config, ModelConfig)) - config, model_config = parser.parse_args_into_dataclasses() + training_args, model_config = parser.parse_args_into_dataclasses() # remove output_dir if exists - shutil.rmtree(config.output_dir, ignore_errors=True) + shutil.rmtree(training_args.output_dir, ignore_errors=True) ################ # Model & Tokenizer @@ -73,16 +73,16 @@ if tokenizer.chat_template is None: tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE value_model = AutoModelForSequenceClassification.from_pretrained( - config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + training_args.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 ) reward_model = AutoModelForSequenceClassification.from_pretrained( - config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + training_args.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 ) ref_policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) ################ # Dataset @@ -107,7 +107,7 @@ def tokenize(element): tokenize, batched=True, remove_columns=dataset.column_names, - num_proc=config.dataset_num_proc, + num_proc=training_args.dataset_num_proc, ) # Compute that only on the main process for faster data processing. 
@@ -120,7 +120,7 @@ def tokenize(element): # Training ################ trainer = PPOv2Trainer( - config=config, + config=training_args, tokenizer=tokenizer, policy=policy, ref_policy=ref_policy, @@ -130,7 +130,7 @@ def tokenize(element): eval_dataset=eval_dataset, ) trainer.train() - trainer.save_model(config.output_dir) - if config.push_to_hub: + trainer.save_model(training_args.output_dir) + if training_args.push_to_hub: trainer.push_to_hub() trainer.generate_completions() diff --git a/examples/scripts/ppo/ppo_tldr.py b/examples/scripts/ppo/ppo_tldr.py index 138e0f2570..4ea8d36763 100644 --- a/examples/scripts/ppo/ppo_tldr.py +++ b/examples/scripts/ppo/ppo_tldr.py @@ -59,9 +59,9 @@ if __name__ == "__main__": parser = HfArgumentParser((PPOv2Config, ModelConfig)) - config, model_config = parser.parse_args_into_dataclasses() + training_args, model_config = parser.parse_args_into_dataclasses() # remove output_dir if exists - shutil.rmtree(config.output_dir, ignore_errors=True) + shutil.rmtree(training_args.output_dir, ignore_errors=True) ################ # Model & Tokenizer @@ -75,16 +75,16 @@ if tokenizer.chat_template is None: tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE value_model = AutoModelForSequenceClassification.from_pretrained( - config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + training_args.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 ) reward_model = AutoModelForSequenceClassification.from_pretrained( - config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + training_args.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 ) ref_policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) ################ # Dataset @@ -107,7 +107,7 @@ def tokenize(element): return dataset.map( tokenize, remove_columns=dataset.column_names, - num_proc=config.dataset_num_proc, + num_proc=training_args.dataset_num_proc, ) # Compute that only on the main process for faster data processing. 
@@ -116,15 +116,15 @@ def tokenize(element): train_dataset = prepare_dataset(train_dataset, tokenizer) eval_dataset = prepare_dataset(eval_dataset, tokenizer) # filtering - train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=config.dataset_num_proc) - eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=config.dataset_num_proc) + train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc) + eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc) assert train_dataset[0]["input_ids"][-1] != tokenizer.eos_token_id, "The last token should not be an EOS token" ################ # Training ################ trainer = PPOv2Trainer( - config=config, + config=training_args, tokenizer=tokenizer, policy=policy, ref_policy=ref_policy, @@ -134,7 +134,7 @@ def tokenize(element): eval_dataset=eval_dataset, ) trainer.train() - trainer.save_model(config.output_dir) - if config.push_to_hub: + trainer.save_model(training_args.output_dir) + if training_args.push_to_hub: trainer.push_to_hub() trainer.generate_completions() diff --git a/examples/scripts/reward_modeling.py b/examples/scripts/reward_modeling.py index e00e5d8ff8..bbb2e23459 100644 --- a/examples/scripts/reward_modeling.py +++ b/examples/scripts/reward_modeling.py @@ -74,8 +74,8 @@ if __name__ == "__main__": parser = HfArgumentParser((RewardScriptArguments, RewardConfig, ModelConfig)) - args, config, model_config = parser.parse_args_into_dataclasses() - config.gradient_checkpointing_kwargs = dict(use_reentrant=False) + args, training_args, model_config = parser.parse_args_into_dataclasses() + training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False) ################ # Model & Tokenizer @@ -138,19 +138,19 @@ def preprocess_function(examples): chosen_fn = conversations_formatting_function(tokenizer, "chosen") rejected_fn = conversations_formatting_function(tokenizer, "rejected") dataset = dataset.map( - lambda x: {"chosen": chosen_fn(x), "rejected": rejected_fn(x)}, num_proc=config.dataset_num_proc + lambda x: {"chosen": chosen_fn(x), "rejected": rejected_fn(x)}, num_proc=training_args.dataset_num_proc ) # Tokenize inputs dataset = dataset.map( preprocess_function, batched=True, - num_proc=config.dataset_num_proc, + num_proc=training_args.dataset_num_proc, ) # Filter out examples that are too long dataset = dataset.filter( - lambda x: len(x["input_ids_chosen"]) <= config.max_length - and len(x["input_ids_rejected"]) <= config.max_length, - num_proc=config.dataset_num_proc, + lambda x: len(x["input_ids_chosen"]) <= training_args.max_length + and len(x["input_ids_rejected"]) <= training_args.max_length, + num_proc=training_args.dataset_num_proc, ) ########## @@ -159,7 +159,7 @@ def preprocess_function(examples): trainer = RewardTrainer( model=model, tokenizer=tokenizer, - args=config, + args=training_args, train_dataset=dataset[args.dataset_train_split], eval_dataset=dataset[args.dataset_test_split], peft_config=get_peft_config(model_config), @@ -169,9 +169,9 @@ def preprocess_function(examples): ############################ # Save model and push to Hub ############################ - trainer.save_model(config.output_dir) + trainer.save_model(training_args.output_dir) metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) - trainer.save_model(config.output_dir) + trainer.save_model(training_args.output_dir) trainer.push_to_hub() diff --git 
a/examples/scripts/rloo/rloo.py b/examples/scripts/rloo/rloo.py index b4e9c2053e..74a52fe69d 100644 --- a/examples/scripts/rloo/rloo.py +++ b/examples/scripts/rloo/rloo.py @@ -61,9 +61,9 @@ if __name__ == "__main__": parser = HfArgumentParser((RLOOConfig, ModelConfig)) - config, model_config = parser.parse_args_into_dataclasses() + training_args, model_config = parser.parse_args_into_dataclasses() # remove output_dir if exists - shutil.rmtree(config.output_dir, ignore_errors=True) + shutil.rmtree(training_args.output_dir, ignore_errors=True) ################ # Model & Tokenizer @@ -77,21 +77,21 @@ if tokenizer.chat_template is None: tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE reward_model = AutoModelForSequenceClassification.from_pretrained( - config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + training_args.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 ) ref_policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) ################ # Dataset ################ - raw_datasets = load_dataset("trl-internal-testing/descriptiveness-sentiment-trl-style", split="descriptiveness") + dataset = load_dataset("trl-internal-testing/descriptiveness-sentiment-trl-style", split="descriptiveness") eval_samples = 20 - train_dataset = raw_datasets.select(range(len(raw_datasets) - eval_samples)) - eval_dataset = raw_datasets.select(range(len(raw_datasets) - eval_samples, len(raw_datasets))) + train_dataset = dataset.select(range(len(dataset) - eval_samples)) + eval_dataset = dataset.select(range(len(dataset) - eval_samples, len(dataset))) dataset_text_field = "prompt" def prepare_dataset(dataset, tokenizer): @@ -108,7 +108,7 @@ def tokenize(element): tokenize, batched=True, remove_columns=dataset.column_names, - num_proc=config.dataset_num_proc, + num_proc=training_args.dataset_num_proc, ) # Compute that only on the main process for faster data processing. 
@@ -121,7 +121,7 @@ def tokenize(element): # Training ################ trainer = RLOOTrainer( - config=config, + config=training_args, tokenizer=tokenizer, policy=policy, ref_policy=ref_policy, @@ -130,7 +130,7 @@ def tokenize(element): eval_dataset=eval_dataset, ) trainer.train() - trainer.save_model(config.output_dir) - if config.push_to_hub: + trainer.save_model(training_args.output_dir) + if training_args.push_to_hub: trainer.push_to_hub() trainer.generate_completions() diff --git a/examples/scripts/rloo/rloo_tldr.py b/examples/scripts/rloo/rloo_tldr.py index ae96d1f0c1..6ebad8d2e9 100644 --- a/examples/scripts/rloo/rloo_tldr.py +++ b/examples/scripts/rloo/rloo_tldr.py @@ -62,9 +62,9 @@ if __name__ == "__main__": parser = HfArgumentParser((RLOOConfig, ModelConfig)) - config, model_config = parser.parse_args_into_dataclasses() + training_args, model_config = parser.parse_args_into_dataclasses() # remove output_dir if exists - shutil.rmtree(config.output_dir, ignore_errors=True) + shutil.rmtree(training_args.output_dir, ignore_errors=True) ################ # Model & Tokenizer @@ -78,13 +78,13 @@ if tokenizer.chat_template is None: tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE reward_model = AutoModelForSequenceClassification.from_pretrained( - config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + training_args.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 ) ref_policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) policy = AutoModelForCausalLM.from_pretrained( - config.sft_model_path, trust_remote_code=model_config.trust_remote_code + training_args.sft_model_path, trust_remote_code=model_config.trust_remote_code ) ################ # Dataset @@ -107,7 +107,7 @@ def tokenize(element): return dataset.map( tokenize, remove_columns=dataset.column_names, - num_proc=config.dataset_num_proc, + num_proc=training_args.dataset_num_proc, ) # Compute that only on the main process for faster data processing. 
@@ -116,15 +116,15 @@ def tokenize(element): train_dataset = prepare_dataset(train_dataset, tokenizer) eval_dataset = prepare_dataset(eval_dataset, tokenizer) # filtering - train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=config.dataset_num_proc) - eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=config.dataset_num_proc) + train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc) + eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc) assert train_dataset[0]["input_ids"][-1] != tokenizer.eos_token_id, "The last token should not be an EOS token" ################ # Training ################ trainer = RLOOTrainer( - config=config, + config=training_args, tokenizer=tokenizer, policy=policy, ref_policy=ref_policy, @@ -133,7 +133,7 @@ def tokenize(element): eval_dataset=eval_dataset, ) trainer.train() - trainer.save_model(config.output_dir) - if config.push_to_hub: + trainer.save_model(training_args.output_dir) + if training_args.push_to_hub: trainer.push_to_hub() trainer.generate_completions() diff --git a/tests/slow/test_sft_slow.py b/tests/slow/test_sft_slow.py index a6c73825d4..2d689b1ab4 100644 --- a/tests/slow/test_sft_slow.py +++ b/tests/slow/test_sft_slow.py @@ -67,7 +67,7 @@ def test_sft_trainer_str(self, model_name, packing): as expected. """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -80,7 +80,7 @@ def test_sft_trainer_str(self, model_name, packing): trainer = SFTTrainer( model_name, - args=args, + args=training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, ) @@ -94,7 +94,7 @@ def test_sft_trainer_transformers(self, model_name, packing): as expected. """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -110,7 +110,7 @@ def test_sft_trainer_transformers(self, model_name, packing): trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -128,7 +128,7 @@ def test_sft_trainer_peft(self, model_name, packing): as expected. """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -145,7 +145,7 @@ def test_sft_trainer_peft(self, model_name, packing): trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -165,7 +165,7 @@ def test_sft_trainer_transformers_mp(self, model_name, packing): as expected in mixed precision. """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -182,7 +182,7 @@ def test_sft_trainer_transformers_mp(self, model_name, packing): trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -199,7 +199,7 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec as expected in mixed precision + different scenarios of gradient_checkpointing. 
""" with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -218,7 +218,7 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -236,7 +236,7 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient as expected in mixed precision + different scenarios of gradient_checkpointing. """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -255,7 +255,7 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -280,7 +280,7 @@ def test_sft_trainer_transformers_mp_gc_device_map( as expected in mixed precision + different scenarios of gradient_checkpointing (single, multi-gpu, etc). """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -299,7 +299,7 @@ def test_sft_trainer_transformers_mp_gc_device_map( trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -318,7 +318,7 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr as expected in mixed precision + different scenarios of gradient_checkpointing. """ with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -339,7 +339,7 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, @@ -363,7 +363,7 @@ def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): with tempfile.TemporaryDirectory() as tmp_dir: train_dataset = load_dataset("trl-internal-testing/dolly-chatml-sft", split="train") - args = SFTConfig( + training_args = SFTConfig( packing=packing, max_seq_length=self.max_seq_length, output_dir=tmp_dir, @@ -383,7 +383,7 @@ def test_sft_trainer_with_chat_format_qlora(self, model_name, packing): trainer = SFTTrainer( model, - args=args, + args=training_args, tokenizer=tokenizer, train_dataset=train_dataset, peft_config=self.peft_config, @@ -403,7 +403,7 @@ def test_sft_trainer_with_liger(self, model_name, packing): with AutoLigerKernelForCausalLM as expected. 
""" with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( output_dir=tmp_dir, logging_strategy="no", report_to="none", @@ -417,7 +417,7 @@ def test_sft_trainer_with_liger(self, model_name, packing): trainer = SFTTrainer( model_name, - args=args, + args=training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, ) diff --git a/tests/test_alignprop_trainer.py b/tests/test_alignprop_trainer.py index bb25bb7cd7..995b91a750 100644 --- a/tests/test_alignprop_trainer.py +++ b/tests/test_alignprop_trainer.py @@ -42,7 +42,7 @@ class AlignPropTrainerTester(unittest.TestCase): """ def setUp(self): - alignprop_config = AlignPropConfig( + training_args = AlignPropConfig( num_epochs=2, train_gradient_accumulation_steps=1, train_batch_size=2, @@ -58,11 +58,9 @@ def setUp(self): pipeline_without_lora = DefaultDDPOStableDiffusionPipeline( pretrained_model, pretrained_model_revision=pretrained_revision, use_lora=False ) - self.trainer_with_lora = AlignPropTrainer( - alignprop_config, scorer_function, prompt_function, pipeline_with_lora - ) + self.trainer_with_lora = AlignPropTrainer(training_args, scorer_function, prompt_function, pipeline_with_lora) self.trainer_without_lora = AlignPropTrainer( - alignprop_config, scorer_function, prompt_function, pipeline_without_lora + training_args, scorer_function, prompt_function, pipeline_without_lora ) def tearDown(self) -> None: diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index a1e6902f93..e19375ff1d 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -70,7 +70,7 @@ def tokenize_function(examples): def test_basic(self): with tempfile.TemporaryDirectory() as tmp_dir: - args = TrainingArguments( + training_args = TrainingArguments( output_dir=tmp_dir, eval_strategy="steps", eval_steps=2, # evaluate every 2 steps @@ -81,7 +81,7 @@ def test_basic(self): trainer = TrainerWithRefModel( model=self.model, ref_model=self.ref_model, - args=args, + args=training_args, train_dataset=self.dataset["train"], eval_dataset=self.dataset["test"], tokenizer=self.tokenizer, @@ -97,7 +97,7 @@ def test_basic(self): def test_without_ref_model(self): # Same as before, but without the ref_model attribute. 
It should use the model attribute instead with tempfile.TemporaryDirectory() as tmp_dir: - args = TrainingArguments( + training_args = TrainingArguments( output_dir=tmp_dir, eval_strategy="steps", eval_steps=2, # evaluate every 2 steps @@ -107,7 +107,7 @@ def test_without_ref_model(self): ) trainer = Trainer( model=self.model, - args=args, + args=training_args, train_dataset=self.dataset["train"], eval_dataset=self.dataset["test"], tokenizer=self.tokenizer, @@ -130,7 +130,7 @@ def test_lora(self): task_type="CAUSAL_LM", ) self.model.add_adapter(peft_config) - args = TrainingArguments( + training_args = TrainingArguments( output_dir=tmp_dir, eval_strategy="steps", eval_steps=2, # evaluate every 2 steps @@ -140,7 +140,7 @@ def test_lora(self): ) trainer = Trainer( model=self.model, - args=args, + args=training_args, train_dataset=self.dataset["train"], eval_dataset=self.dataset["test"], tokenizer=self.tokenizer, diff --git a/tests/test_ddpo_trainer.py b/tests/test_ddpo_trainer.py index 71c94e7502..8db0b0747a 100644 --- a/tests/test_ddpo_trainer.py +++ b/tests/test_ddpo_trainer.py @@ -41,7 +41,7 @@ class DDPOTrainerTester(unittest.TestCase): """ def setUp(self): - self.ddpo_config = DDPOConfig( + self.training_args = DDPOConfig( num_epochs=2, train_gradient_accumulation_steps=1, per_prompt_stat_tracking_buffer_size=32, @@ -57,7 +57,7 @@ def setUp(self): pretrained_model, pretrained_model_revision=pretrained_revision, use_lora=False ) - self.trainer = DDPOTrainer(self.ddpo_config, scorer_function, prompt_function, pipeline) + self.trainer = DDPOTrainer(self.training_args, scorer_function, prompt_function, pipeline) return super().setUp() @@ -107,7 +107,7 @@ class DDPOTrainerWithLoRATester(DDPOTrainerTester): """ def setUp(self): - self.ddpo_config = DDPOConfig( + self.training_args = DDPOConfig( num_epochs=2, train_gradient_accumulation_steps=1, per_prompt_stat_tracking_buffer_size=32, @@ -123,6 +123,6 @@ def setUp(self): pretrained_model, pretrained_model_revision=pretrained_revision, use_lora=True ) - self.trainer = DDPOTrainer(self.ddpo_config, scorer_function, prompt_function, pipeline) + self.trainer = DDPOTrainer(self.training_args, scorer_function, prompt_function, pipeline) return super().setUp() diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index f6a50c3ee7..7d6e5c4670 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -112,7 +112,7 @@ def mock_vision_processor(text, images=None, add_special_tokens=True): class TestTruncateTokens(unittest.TestCase): def setUp(self): with tempfile.TemporaryDirectory() as tmp_dir: - self.args = DPOConfig( + self.training_args = DPOConfig( max_length=20, max_prompt_length=10, truncation_mode="keep_start", output_dir=tmp_dir ) @@ -135,7 +135,7 @@ def test_truncate_tokens(self): ] prompt_tokens = [{"prompt_input_ids": list(range(15)), "prompt_attention_mask": [1] * 15}] - _truncate_tokens(chosen_tokens, rejected_tokens, prompt_tokens, self.args) + _truncate_tokens(chosen_tokens, rejected_tokens, prompt_tokens, self.training_args) # Check if prompt is truncated correctly self.assertEqual(len(chosen_tokens[0]["prompt_input_ids"]), 10) @@ -152,7 +152,7 @@ def test_truncate_tokens(self): self.assertEqual(len(rejected_tokens[0]["attention_mask"]), 10) def test_truncation_mode_keep_end(self): - self.args.truncation_mode = "keep_end" + self.training_args.truncation_mode = "keep_end" chosen_tokens = [ { "prompt_input_ids": list(range(15)), @@ -171,7 +171,7 @@ def test_truncation_mode_keep_end(self): ] prompt_tokens = 
[{"prompt_input_ids": list(range(15)), "prompt_attention_mask": [1] * 15}] - _truncate_tokens(chosen_tokens, rejected_tokens, prompt_tokens, self.args) + _truncate_tokens(chosen_tokens, rejected_tokens, prompt_tokens, self.training_args) # Check if prompt is truncated correctly from the end self.assertEqual(prompt_tokens[0]["prompt_input_ids"], list(range(5, 15))) @@ -190,9 +190,9 @@ def test_truncation_mode_keep_end(self): self.assertEqual(rejected_tokens[0]["attention_mask"], [1] * 10) def test_invalid_truncation_mode(self): - self.args.truncation_mode = "invalid_mode" + self.training_args.truncation_mode = "invalid_mode" with self.assertRaises(ValueError): - _truncate_tokens([], [], [], self.args) + _truncate_tokens([], [], [], self.training_args) class DPOTrainerTester(unittest.TestCase): @@ -895,7 +895,7 @@ def test_dpo_trainer_torch_dtype(self): # See https://github.com/huggingface/trl/issues/1751 dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference") with tempfile.TemporaryDirectory() as tmp_dir: - dpo_config = DPOConfig( + training_args = DPOConfig( output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=1, @@ -908,7 +908,7 @@ def test_dpo_trainer_torch_dtype(self): model=self.model_id, ref_model=self.model_id, tokenizer=self.tokenizer, - args=dpo_config, + args=training_args, train_dataset=dummy_dataset["train"], ) assert trainer.model.config.torch_dtype == torch.float16 @@ -916,7 +916,7 @@ def test_dpo_trainer_torch_dtype(self): # Now test when `torch_dtype` is provided but is wrong to either the model or the ref_model with tempfile.TemporaryDirectory() as tmp_dir: - dpo_config = DPOConfig( + training_args = DPOConfig( output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=1, @@ -931,12 +931,12 @@ def test_dpo_trainer_torch_dtype(self): _ = DPOTrainer( model=self.model_id, tokenizer=self.tokenizer, - args=dpo_config, + args=training_args, train_dataset=dummy_dataset["train"], ) with tempfile.TemporaryDirectory() as tmp_dir: - dpo_config = DPOConfig( + training_args = DPOConfig( output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=1, @@ -952,7 +952,7 @@ def test_dpo_trainer_torch_dtype(self): model=self.model_id, ref_model=self.model_id, tokenizer=self.tokenizer, - args=dpo_config, + args=training_args, train_dataset=dummy_dataset["train"], ) diff --git a/tests/test_iterative_sft_trainer.py b/tests/test_iterative_sft_trainer.py index 9cdda3418f..295d82bf69 100644 --- a/tests/test_iterative_sft_trainer.py +++ b/tests/test_iterative_sft_trainer.py @@ -100,14 +100,14 @@ def test_iterative_step_from_tensor(self, model_name, input_name): model = self.t5_model tokenizer = self.t5_tokenizer - args = TrainingArguments( + training_args = TrainingArguments( output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=2, learning_rate=1e-3, report_to="none", ) - iterative_trainer = IterativeSFTTrainer(model=model, args=args, tokenizer=tokenizer) + iterative_trainer = IterativeSFTTrainer(model=model, args=training_args, tokenizer=tokenizer) iterative_trainer.optimizer.zero_grad = partial(iterative_trainer.optimizer.zero_grad, set_to_none=False) iterative_trainer.step(**inputs) diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py index 173369a020..cbd43ccf0c 100644 --- a/tests/test_trainers_args.py +++ b/tests/test_trainers_args.py @@ -41,7 +41,7 @@ def test_bco(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train") with 
tempfile.TemporaryDirectory() as tmp_dir: - args = BCOConfig( + training_args = BCOConfig( tmp_dir, max_length=256, max_prompt_length=64, @@ -60,7 +60,9 @@ def test_bco(self): min_density_ratio=0.2, max_density_ratio=20.0, ) - trainer = BCOTrainer(model="gpt2", ref_model="gpt2", args=args, train_dataset=dataset, tokenizer=tokenizer) + trainer = BCOTrainer( + model="gpt2", ref_model="gpt2", args=training_args, train_dataset=dataset, tokenizer=tokenizer + ) self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) self.assertEqual(trainer.args.max_completion_length, 64) @@ -82,7 +84,7 @@ def test_cpo(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train") with tempfile.TemporaryDirectory() as tmp_dir: - args = CPOConfig( + training_args = CPOConfig( tmp_dir, max_length=256, max_prompt_length=64, @@ -101,7 +103,7 @@ def test_cpo(self): model_init_kwargs={"trust_remote_code": True}, dataset_num_proc=4, ) - trainer = CPOTrainer(model="gpt2", args=args, train_dataset=dataset, tokenizer=tokenizer) + trainer = CPOTrainer(model="gpt2", args=training_args, train_dataset=dataset, tokenizer=tokenizer) self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) self.assertEqual(trainer.args.max_completion_length, 64) @@ -123,7 +125,7 @@ def test_dpo(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train") with tempfile.TemporaryDirectory() as tmp_dir: - args = DPOConfig( + training_args = DPOConfig( tmp_dir, beta=0.5, label_smoothing=0.5, @@ -152,7 +154,9 @@ def test_dpo(self): ref_model_sync_steps=32, rpo_alpha=0.5, ) - trainer = DPOTrainer(model="gpt2", ref_model="gpt2", args=args, train_dataset=dataset, tokenizer=tokenizer) + trainer = DPOTrainer( + model="gpt2", ref_model="gpt2", args=training_args, train_dataset=dataset, tokenizer=tokenizer + ) self.assertEqual(trainer.args.beta, 0.5) self.assertEqual(trainer.args.label_smoothing, 0.5) self.assertEqual(trainer.args.loss_type, "hinge") @@ -184,7 +188,7 @@ def test_kto(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train") with tempfile.TemporaryDirectory() as tmp_dir: - args = KTOConfig( + training_args = KTOConfig( tmp_dir, max_length=256, max_prompt_length=64, @@ -202,7 +206,9 @@ def test_kto(self): ref_model_init_kwargs={"trust_remote_code": True}, dataset_num_proc=4, ) - trainer = KTOTrainer(model="gpt2", ref_model="gpt2", args=args, train_dataset=dataset, tokenizer=tokenizer) + trainer = KTOTrainer( + model="gpt2", ref_model="gpt2", args=training_args, train_dataset=dataset, tokenizer=tokenizer + ) self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) self.assertEqual(trainer.args.max_completion_length, 64) @@ -223,7 +229,7 @@ def test_online_dpo(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") with tempfile.TemporaryDirectory() as tmp_dir: - args = OnlineDPOConfig( + training_args = OnlineDPOConfig( tmp_dir, max_new_tokens=42, temperature=0.5, @@ -236,7 +242,7 @@ def test_online_dpo(self): ref_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-14m") reward_model = 
AutoModelForSequenceClassification.from_pretrained("EleutherAI/pythia-14m", num_labels=1) trainer = OnlineDPOTrainer( - args=args, + args=training_args, tokenizer=tokenizer, model=model, ref_model=ref_model, @@ -254,7 +260,7 @@ def test_orpo(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train") with tempfile.TemporaryDirectory() as tmp_dir: - args = ORPOConfig( + training_args = ORPOConfig( tmp_dir, max_length=256, max_prompt_length=64, @@ -270,7 +276,7 @@ def test_orpo(self): dataset_num_proc=4, ) - trainer = ORPOTrainer(model="gpt2", args=args, train_dataset=dataset, tokenizer=tokenizer) + trainer = ORPOTrainer(model="gpt2", args=training_args, train_dataset=dataset, tokenizer=tokenizer) self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) self.assertEqual(trainer.args.max_completion_length, 64) @@ -281,7 +287,7 @@ def test_orpo(self): def test_sft(self): dataset = load_dataset("trl-internal-testing/zen", "standard_language_modeling", split="train") with tempfile.TemporaryDirectory() as tmp_dir: - args = SFTConfig( + training_args = SFTConfig( tmp_dir, dataset_text_field="dummy_text_field", packing=True, @@ -295,7 +301,7 @@ def test_sft(self): num_of_sequences=32, chars_per_token=4.2, ) - trainer = SFTTrainer("gpt2", args=args, train_dataset=dataset) + trainer = SFTTrainer("gpt2", args=training_args, train_dataset=dataset) self.assertEqual(trainer.args.dataset_text_field, "dummy_text_field") self.assertEqual(trainer.args.packing, True) self.assertEqual(trainer.args.max_seq_length, 256)
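
The patch above is purely mechanical: every `*Config` dataclass instance (`CPOConfig`, `SFTConfig`, `RLOOConfig`, ...) is now bound to a variable named `training_args` before being handed to its trainer, matching the `TrainingArguments` naming referenced in the PR title. As a quick reference, below is a minimal sketch of the convention after this change, using the `SFTTrainer` snippet from the docs touched above; it is illustrative only and not part of the patch itself.

```python
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

dataset = load_dataset("stanfordnlp/imdb", split="train")

# After this patch, the config object is consistently named `training_args` ...
training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=512,
    output_dir="/tmp",
)

# ... and is passed to the trainer (via `args=` here; PPOv2/RLOO use `config=`,
# and AlignProp/DDPO take the config positionally, as shown in the example scripts above).
trainer = SFTTrainer(
    "facebook/opt-350m",
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
```

The rename touches documentation, example scripts, and tests only; no trainer behavior changes in this commit.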