
Commit b7fedb9

unify run_grpo with multiple env

Signed-off-by: ruit <[email protected]>

1 parent 249788d

File tree

7 files changed (+84, -170 lines)


examples/configs/grpo_helpsteer3.yaml

Lines changed: 0 additions & 106 deletions
This file was deleted.

examples/configs/recipes/llm/sft-nemotron-super-49b-tulu-v3.yaml

Lines changed: 24 additions & 3 deletions
@@ -1,19 +1,36 @@
-defaults:
-  - ../../sft.yaml
-  - ../../sft_nemotron_super_49b_base.yaml
+defaults: ../../sft.yaml
sft:
  max_num_steps: 50
  val_period: 5
+  val_global_batch_size: 128
+
+checkpointing:
+  checkpoint_dir: results/sft_nemotron_super_49b
+  metric_name: val_loss
+  keep_top_k: 100
+  save_period: 500
+
policy:
  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
  max_total_sequence_length: 32768
+  train_global_batch_size: 128
  dtensor_cfg:
    _v2: true
    activation_checkpointing: true
    context_parallel_size: 8
+    tensor_parallel_size: 4
+    custom_parallel_plan: examples.custom_parallel.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
+  dynamic_batching:
+    train_mb_tokens: 4096
+    logprob_mb_tokens: 8192
+  make_sequence_length_divisible_by: ${max:${mul:${policy.dtensor_cfg.context_parallel_size},
+    2}, ${policy.max_total_sequence_length}}
+  max_grad_norm: null
  optimizer:
    kwargs:
      lr: 1.0e-05
+      weight_decay: 0.01
+      eps: 1.0e-08
  scheduler:
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
@@ -28,8 +45,12 @@ policy:
      - 10
data:
  dataset_name: tulu3_sft_mixture
+  num_workers: 20
  test_size: 0.05
logger:
+  tensorboard_enabled: false
+  monitor_gpus: false
+  num_val_samples_to_print: 0
  wandb:
    project: nemotron-tulu-3-sft
    name: nemotron-tulu-3
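
The make_sequence_length_divisible_by entry above chains custom interpolations. Below is a minimal sketch of how such ${mul:...} and ${max:...} expressions could be evaluated with OmegaConf custom resolvers; registering the resolvers like this is an assumption for illustration, not NeMo RL's actual wiring.

```python
# Minimal sketch: resolving the ${max:${mul:...}} interpolation from the
# recipe with OmegaConf custom resolvers. Registering "mul" and "max" this
# way is assumed for illustration; NeMo RL may wire them up differently.
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
OmegaConf.register_new_resolver("max", lambda a, b: max(a, b))

cfg = OmegaConf.create(
    {
        "policy": {
            "max_total_sequence_length": 32768,
            "dtensor_cfg": {"context_parallel_size": 8},
            # max(context_parallel_size * 2, max_total_sequence_length)
            "make_sequence_length_divisible_by": (
                "${max:${mul:${policy.dtensor_cfg.context_parallel_size},2},"
                "${policy.max_total_sequence_length}}"
            ),
        }
    }
)
print(cfg.policy.make_sequence_length_divisible_by)  # max(8 * 2, 32768) -> 32768
```

For this recipe the expression resolves to 32768, since the sequence length dominates twice the context-parallel size.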
Lines changed: 33 additions & 3 deletions
@@ -1,10 +1,37 @@
-defaults:
-  - ../../sft.yaml
-  - ../../sft_nemotron_super_49b_base.yaml
+defaults: ../../sft.yaml
sft:
  max_num_epochs: 3
+  max_num_steps: 100
+  val_global_batch_size: 128
+checkpointing:
+  checkpoint_dir: results/sft-nemotron-super-49b
+  metric_name: val_loss
+  keep_top_k: 100
+  save_period: 500
policy:
+  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
+  max_total_sequence_length: 4096
+  train_global_batch_size: 128
  train_micro_batch_size: 8
+  dtensor_cfg:
+    _v2: true
+    activation_checkpointing: true
+    context_parallel_size: 2
+    tensor_parallel_size: 4
+    custom_parallel_plan: examples.custom_parallel.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
+  dynamic_batching:
+    train_mb_tokens: 4096
+    logprob_mb_tokens: 8192
+  make_sequence_length_divisible_by: ${max:${mul:${policy.dtensor_cfg.context_parallel_size},
+    2}, ${policy.max_total_sequence_length}}
+  max_grad_norm: null
+  optimizer:
+    kwargs:
+      lr: 2.0e-05
+      weight_decay: 0.01
+      eps: 1.0e-08
+data:
+  num_workers: 20
logger:
  tensorboard_enabled: false
  monitor_gpus: false
@@ -15,4 +42,7 @@ logger:
  tensorboard:
    log_dir: tb_logs-openmathinstruct-nemorl-1M_train
  mlflow:
+    experiment_name: sft-nemotron-super-49b
    run_name: openmathinstruct-nemorl-1M_train
+cluster:
+  gpus_per_node: 8
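
Both recipes collapse their defaults from two parents to one, inlining what previously came from the now-deleted sft_nemotron_super_49b_base.yaml (see below). A rough sketch of the merge semantics this relies on, assuming OmegaConf-style composition where the recipe overrides its single remaining parent; the loader shown is illustrative, not NeMo RL's config machinery.

```python
# Illustrative only: how a recipe with a single `defaults` parent could be
# composed, assuming OmegaConf merge semantics (recipe values win key-by-key).
from omegaconf import OmegaConf

base = OmegaConf.create(    # stand-in for ../../sft.yaml
    {"sft": {"max_num_epochs": 1, "val_period": 10}}
)
recipe = OmegaConf.create(  # stand-in for the recipe file above
    {"sft": {"max_num_epochs": 3, "max_num_steps": 100}}
)

merged = OmegaConf.merge(base, recipe)  # later configs override earlier ones
assert merged.sft.max_num_epochs == 3   # overridden by the recipe
assert merged.sft.val_period == 10      # inherited from the parent
```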

examples/configs/sft_nemotron_super_49b_base.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

examples/run_grpo_math.py

Lines changed: 5 additions & 2 deletions
@@ -82,12 +82,15 @@ def setup_data(

    # load dataset
    data: Any = load_response_dataset(data_config, seed)
+    task_name = (
+        data.task_name if hasattr(data, "task_name") else data.task_spec.task_name
+    )

    # data processor
    task_data_processors: dict[str, tuple[TaskDataSpec, TaskDataProcessFnCallable]] = (
        defaultdict(lambda: (math_task_spec, math_hf_data_processor))
    )
-    task_data_processors["math"] = (math_task_spec, math_hf_data_processor)
+    task_data_processors[task_name] = (math_task_spec, math_hf_data_processor)

    # setup math environment
    math_env = MathEnvironment.options(  # type: ignore # it's wrapped with ray.remote
@@ -120,7 +123,7 @@ def setup_data(
    val_dataset = None

    task_to_env: dict[str, EnvironmentInterface] = defaultdict(lambda: math_env)
-    task_to_env["math"] = math_env
+    task_to_env[task_name] = math_env
    return dataset, val_dataset, task_to_env, task_to_env
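This is the core of the commit: rather than hard-coding the "math" task, the dataset's own task_name keys both the processor table and the task-to-environment table, with the defaultdict factory as a catch-all for anything else. A self-contained sketch of that routing pattern follows; StubEnv and the task names are placeholders, not NeMo RL's real EnvironmentInterface.

```python
# Sketch of the task-routing pattern from the diff: an explicit entry for the
# dataset's task_name plus a defaultdict fallback for unseen tasks. StubEnv
# is a placeholder; the real code uses Ray-wrapped environment actors.
from collections import defaultdict


class StubEnv:
    """Placeholder environment that just echoes the batch."""

    def step(self, batch):
        return batch


math_env = StubEnv()

task_to_env = defaultdict(lambda: math_env)
task_to_env["helpsteer3"] = math_env  # example task_name, as the diff registers it

# An unseen task name resolves through the defaultdict factory, not KeyError:
assert task_to_env["anything_else"] is math_env
```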
nemo_rl/data/datasets/response_datasets/helpsteer3.py

Lines changed: 20 additions & 8 deletions
@@ -1,3 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from typing import Any

from absl import logging
@@ -7,11 +20,7 @@


# Choose the chosen response as the response and the rejected response as the target
-def to_response_data_format(
-    data: dict[str, Any],
-) -> dict[
-    str, list[dict[str, int | list[dict[str, str | Any]]]] | list[dict[str, str]]
-]:
+def to_response_data_format(data: dict[str, Any]) -> dict:
    response_1 = data["response1"]
    response_2 = data["response2"]
    overall_preference = data["overall_preference"]
@@ -28,10 +37,13 @@ def to_response_data_format(
    else:
        chosen = response_2

+    if isinstance(data["context"], str):
+        context = [{"role": "user", "content": data["context"]}]
+    else:
+        context = data["context"]
+
    return {
-        "context": [{"role": "user", "content": data["context"]}]
-        if isinstance(data["context"], str)
-        else data["context"],
+        "context": context,
        "response": [{"role": "assistant", "content": chosen}],
    }

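For reference, a hedged usage sketch of the simplified to_response_data_format. The field names mirror what the function reads in the diff; the sample values and the sign convention on overall_preference (negative favoring response1) are assumptions for illustration.

```python
# Hedged usage sketch for to_response_data_format. Field names follow the
# diff; the sample values and the assumption that a negative
# overall_preference selects response1 are illustrative only.
from nemo_rl.data.datasets.response_datasets.helpsteer3 import (
    to_response_data_format,
)

sample = {
    "context": "Summarize the commit in one line.",
    "response1": "Unifies run_grpo so one script serves multiple environments.",
    "response2": "Misc changes.",
    "overall_preference": -2,
}

out = to_response_data_format(sample)
# The string context is wrapped into a chat-style message list:
#   out["context"]  == [{"role": "user", "content": "Summarize the commit in one line."}]
# and out["response"] holds the chosen reply as a single assistant turn:
#   out["response"] == [{"role": "assistant", "content": <chosen response>}]
```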
pyrefly.toml

Lines changed: 2 additions & 1 deletion
@@ -64,6 +64,7 @@ project-includes = [
    "nemo_rl/data/datasets/response_datasets/oai_format_dataset.py",
    "nemo_rl/data/datasets/response_datasets/oasst.py",
    "nemo_rl/data/datasets/response_datasets/openmathinstruct2.py",
+    "nemo_rl/data/datasets/response_datasets/helpsteer3.py",
    "nemo_rl/data/datasets/response_datasets/refcoco.py",
    "nemo_rl/data/datasets/response_datasets/response_dataset.py",
    "nemo_rl/data/datasets/response_datasets/squad.py",
@@ -81,7 +82,7 @@ project-includes = [
    "nemo_rl/distributed/worker_group_utils.py",
    "nemo_rl/environments/__init__.py",
    "nemo_rl/environments/games/sliding_puzzle.py",
-    "nemo_rl/environments/helpsteer3_environment.py",
+    "nemo_rl/environments/code_jaccard_environment.py",
    "nemo_rl/environments/interfaces.py",
    "nemo_rl/environments/math_environment.py",
    "nemo_rl/environments/metrics.py",
