
Commit ab0ac80

unify run_grpo with multiple env

Signed-off-by: ruit <[email protected]>
1 parent d008143

35 files changed: +808 -507 lines

examples/configs/grpo_helpsteer3.yaml

Lines changed: 0 additions & 34 deletions
This file was deleted.

examples/configs/grpo_math_1B.yaml

Lines changed: 2 additions & 4 deletions
@@ -260,12 +260,10 @@ data:
 
 env:
   math:
+    enabled: true
     num_workers: 8
     math_verify_impl: "hf_math_verify"
-  ## unused in this config but needed for DAPO recipe
-  dapo:
-    num_workers: 8
-    math_verify_impl: "dapo_math_verify"
+    processor: "math_hf_data_processor"
 
 logger:
   log_dir: "logs"  # Base directory for all logs

examples/configs/grpo_math_1B_megatron.yaml

Lines changed: 1 addition & 0 deletions
@@ -156,6 +156,7 @@ data:
 
 env:
   math:
+    enabled: true
     num_workers: 8
     math_verify_impl: "hf_math_verify"

examples/configs/grpo_rm_1B.yaml

Lines changed: 4 additions & 0 deletions
@@ -2,15 +2,19 @@
 defaults: "grpo_math_1B.yaml"
 
 env:
+  math:
+    enabled: false
   reward_model:
     enabled: true
+    processor: "math_hf_data_processor"
     model_name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
     tokenizer:
       name: ${env.reward_model.model_name}
     precision: "bfloat16"
     batch_size: ${policy.train_micro_batch_size}
     checkpoint_path: null
     max_model_len: 2048
+    offload_optimizer_for_logprob: false
   resources:
     gpus_per_node: 1
     num_nodes: 1
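Since grpo_rm_1B.yaml inherits from grpo_math_1B.yaml through its defaults: key, it has to explicitly set math.enabled: false to override the parent, and that override only behaves as shown if the loader deep-merges the two configs rather than replacing the whole env block. A rough sketch of that overlay, under the assumption that defaults: means recursive dict merging here:

from typing import Any

def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Recursively overlay override onto base; override keys win, sibling keys survive."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

# grpo_math_1B.yaml (parent) and grpo_rm_1B.yaml (child), reduced to the env block:
parent = {"env": {"math": {"enabled": True, "num_workers": 8}}}
child = {"env": {"math": {"enabled": False}, "reward_model": {"enabled": True}}}

merged = deep_merge(parent, child)
assert merged["env"]["math"] == {"enabled": False, "num_workers": 8}
assert merged["env"]["reward_model"]["enabled"] is True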

examples/configs/grpo_sliding_puzzle.yaml

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ data:
 
 env:
   sliding_puzzle_game:
+    enabled: true
     cfg:
       game_config:
         size: 5  # Size of the puzzle (e.g., 2 for 2x2, 3 for 3x3)

examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml

Lines changed: 1 addition & 2 deletions
@@ -85,9 +85,8 @@ data:
   prompt_file: null
   dataset_name: DAPOMath17K
 env:
-  dapo:
-    num_workers: 16
   math:
+    enabled: true
     num_workers: 16
     math_verify_impl: "dapo_math_verify"

examples/configs/recipes/llm/grpo-helpsteer3-llama-3.2-1b-1n8g-fsdp2tp1.yaml

Lines changed: 10 additions & 5 deletions
@@ -3,8 +3,8 @@ grpo:
   max_num_epochs: 3
   max_num_steps: 500
 checkpointing:
-  checkpoint_dir: results/grpo-helpsteer3-llama-3.2-1b
-  metric_name: val_reward
+  checkpoint_dir: results/grpo-helpsteer3-llama-3.2-1b-5
+  metric_name: val:reward
 policy:
   model_name: meta-llama/Llama-3.2-1B-Instruct
   max_total_sequence_length: 2048

@@ -18,19 +18,24 @@ policy:
 data:
   prompt_file: null
   dataset_name: HelpSteer3
+  split: preference
 env:
-  helpsteer3:
+  math:
+    enabled: false
+  code_jaccard:
+    enabled: true
     num_workers: 8
-    reward_model: preference_based
+    processor: helpsteer3_data_processor
 logger:
   wandb_enabled: true
   tensorboard_enabled: true
   wandb:
     project: grpo-helpsteer3-llama-3.2-1b
-    name: grpo-helpsteer3-llama-3.2-1b
+    name: grpo-helpsteer3-llama-3.2-1b-tp${policy.dtensor_cfg.tensor_parallel_size}
   tensorboard:
     log_dir: tb_logs-grpo-helpsteer3-llama-3.2-1b
   mlflow:
+    experiment_name: grpo-helpsteer3
     run_name: grpo-helpsteer3-llama-3.2-1b
 cluster:
   gpus_per_node: 8
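The renamed wandb run (…-tp${policy.dtensor_cfg.tensor_parallel_size}) leans on ${...} config interpolation, so the run name tracks whatever tensor-parallel size the recipe is launched with instead of a hardcoded suffix. The syntax looks like OmegaConf-style interpolation; assuming that is what resolves these references, the behavior is roughly:

from omegaconf import OmegaConf  # assumed resolver for the ${...} references

cfg = OmegaConf.create("""
policy:
  dtensor_cfg:
    tensor_parallel_size: 1
logger:
  wandb:
    name: grpo-helpsteer3-llama-3.2-1b-tp${policy.dtensor_cfg.tensor_parallel_size}
""")

# Interpolation resolves on access, so the name follows the configured TP size.
print(cfg.logger.wandb.name)  # grpo-helpsteer3-llama-3.2-1b-tp1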

examples/configs/recipes/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-4n8g-fsdp2tp8.yaml

Lines changed: 13 additions & 9 deletions
@@ -4,23 +4,23 @@ grpo:
   max_num_steps: 10
 checkpointing:
   checkpoint_dir: results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
-  metric_name: val_reward
+  metric_name: val:reward
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 32768
   train_global_batch_size: 64
   train_micro_batch_size: 1
   logprob_batch_size: 1
-  dynamic_batching:
-    enabled: true
-  sequence_packing:
-    enabled: false
   dtensor_cfg:
     activation_checkpointing: true
     context_parallel_size: 4
     cpu_offload: true
     tensor_parallel_size: 8
     custom_parallel_plan: examples.custom_parallel.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
+  dynamic_batching:
+    enabled: true
+  sequence_packing:
+    enabled: false
   optimizer:
     kwargs:
       lr: 3.0e-07

@@ -42,19 +42,23 @@ policy:
 data:
   prompt_file: null
   dataset_name: HelpSteer3
+  split: preference
 env:
-  helpsteer3:
+  math:
+    enabled: false
+  code_jaccard:
+    enabled: true
     num_workers: 8
-    reward_model: preference_based
+    processor: helpsteer3_data_processor
 logger:
   wandb_enabled: true
-  monitor_gpus: false
   wandb:
     project: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
-    name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp8
+    name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp${policy.dtensor_cfg.tensor_parallel_size}
   tensorboard:
     log_dir: tb_logs-grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
   mlflow:
+    experiment_name: grpo-helpsteer3
     run_name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
 cluster:
   gpus_per_node: 8

examples/configs/recipes/llm/sft-nemotron-super-49b-tulu-v3.yaml

Lines changed: 6 additions & 2 deletions
@@ -3,11 +3,13 @@ sft:
   max_num_steps: 50
   val_period: 5
   val_global_batch_size: 128
+
 checkpointing:
   checkpoint_dir: results/sft_nemotron_super_49b
-  metric_name: val_loss
+  metric_name: val:loss
   keep_top_k: 100
   save_period: 500
+
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 32768

@@ -43,10 +45,11 @@ policy:
   - 10
 data:
   dataset_name: tulu3_sft_mixture
-  test_size: 0.05
   num_workers: 20
+  test_size: 0.05
 logger:
   tensorboard_enabled: false
+  monitor_gpus: false
   num_val_samples_to_print: 0
   wandb:
     project: nemotron-tulu-3-sft

@@ -58,3 +61,4 @@ logger:
     run_name: nemotron-tulu-3-sft
 cluster:
   gpus_per_node: 8
+  num_nodes: 8

examples/configs/recipes/llm/sft-nemotron-super-49b.yaml

Lines changed: 8 additions & 7 deletions
@@ -4,14 +4,15 @@ sft:
   max_num_steps: 100
   val_global_batch_size: 128
 checkpointing:
-  checkpoint_dir: results/sft_nemotron_super_49b
-  metric_name: val_loss
+  checkpoint_dir: results/sft-nemotron-super-49b
+  metric_name: val:loss
   keep_top_k: 100
   save_period: 500
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 4096
   train_global_batch_size: 128
+  train_micro_batch_size: 8
   dtensor_cfg:
     _v2: true
     activation_checkpointing: true

@@ -36,12 +37,12 @@ logger:
   monitor_gpus: false
   num_val_samples_to_print: 0
   wandb:
-    project: sft-nemotron-super-49b
-    name: sft-nemotron-super-49b
+    project: sft-nemotron
+    name: sft-${data.dataset_name}-nemotron-super-49b
   tensorboard:
-    log_dir: tb_logs-sft-nemotron-super-49b
+    log_dir: tb_logs-openmathinstruct-nemorl-1M_train
   mlflow:
     experiment_name: sft-nemotron-super-49b
-    run_name: sft-nemotron-super-49b
+    run_name: openmathinstruct-nemorl-1M_train
 cluster:
-  gpus_per_node: 8
+  gpus_per_node: 8
