
Commit ab0ac80

unify run_grpo with multiple env

Signed-off-by: ruit <[email protected]>
1 parent d008143

35 files changed: +808 -507 lines

examples/configs/grpo_helpsteer3.yaml

Lines changed: 0 additions & 34 deletions
This file was deleted.

examples/configs/grpo_math_1B.yaml

Lines changed: 2 additions & 4 deletions
@@ -260,12 +260,10 @@ data:
 
 env:
   math:
+    enabled: true
     num_workers: 8
     math_verify_impl: "hf_math_verify"
-  ## unused in this config but needed for DAPO recipe
-  dapo:
-    num_workers: 8
-    math_verify_impl: "dapo_math_verify"
+    processor: "math_hf_data_processor"
 
 logger:
   log_dir: "logs"  # Base directory for all logs

examples/configs/grpo_math_1B_megatron.yaml

Lines changed: 1 addition & 0 deletions
@@ -156,6 +156,7 @@ data:
 
 env:
   math:
+    enabled: true
     num_workers: 8
     math_verify_impl: "hf_math_verify"

examples/configs/grpo_rm_1B.yaml

Lines changed: 4 additions & 0 deletions
@@ -2,15 +2,19 @@
 defaults: "grpo_math_1B.yaml"
 
 env:
+  math:
+    enabled: false
   reward_model:
     enabled: true
+    processor: "math_hf_data_processor"
     model_name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
     tokenizer:
       name: ${env.reward_model.model_name}
     precision: "bfloat16"
     batch_size: ${policy.train_micro_batch_size}
     checkpoint_path: null
     max_model_len: 2048
+    offload_optimizer_for_logprob: false
   resources:
     gpus_per_node: 1
     num_nodes: 1
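Since grpo_rm_1B.yaml inherits from grpo_math_1B.yaml through its defaults: key, it has to explicitly set math.enabled: false to override the parent, and that override only behaves as shown if the loader deep-merges the two configs rather than replacing the whole env block. A rough sketch of that overlay, under the assumption that defaults: means recursive dict merging here:

from typing import Any

def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Recursively overlay override onto base; override keys win, sibling keys survive."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

# grpo_math_1B.yaml (parent) and grpo_rm_1B.yaml (child), reduced to the env block:
parent = {"env": {"math": {"enabled": True, "num_workers": 8}}}
child = {"env": {"math": {"enabled": False}, "reward_model": {"enabled": True}}}

merged = deep_merge(parent, child)
assert merged["env"]["math"] == {"enabled": False, "num_workers": 8}
assert merged["env"]["reward_model"]["enabled"] is True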

examples/configs/grpo_sliding_puzzle.yaml

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ data:
 
 env:
   sliding_puzzle_game:
+    enabled: true
     cfg:
       game_config:
         size: 5  # Size of the puzzle (e.g., 2 for 2x2, 3 for 3x3)

examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml

Lines changed: 1 addition & 2 deletions
@@ -85,9 +85,8 @@ data:
   prompt_file: null
   dataset_name: DAPOMath17K
 env:
-  dapo:
-    num_workers: 16
   math:
+    enabled: true
     num_workers: 16
     math_verify_impl: "dapo_math_verify"

examples/configs/recipes/llm/grpo-helpsteer3-llama-3.2-1b-1n8g-fsdp2tp1.yaml

Lines changed: 10 additions & 5 deletions
@@ -3,8 +3,8 @@ grpo:
   max_num_epochs: 3
   max_num_steps: 500
 checkpointing:
-  checkpoint_dir: results/grpo-helpsteer3-llama-3.2-1b
-  metric_name: val_reward
+  checkpoint_dir: results/grpo-helpsteer3-llama-3.2-1b-5
+  metric_name: val:reward
 policy:
   model_name: meta-llama/Llama-3.2-1B-Instruct
   max_total_sequence_length: 2048

@@ -18,19 +18,24 @@ policy:
 data:
   prompt_file: null
   dataset_name: HelpSteer3
+  split: preference
 env:
-  helpsteer3:
+  math:
+    enabled: false
+  code_jaccard:
+    enabled: true
     num_workers: 8
-    reward_model: preference_based
+    processor: helpsteer3_data_processor
 logger:
   wandb_enabled: true
   tensorboard_enabled: true
   wandb:
     project: grpo-helpsteer3-llama-3.2-1b
-    name: grpo-helpsteer3-llama-3.2-1b
+    name: grpo-helpsteer3-llama-3.2-1b-tp${policy.dtensor_cfg.tensor_parallel_size}
   tensorboard:
     log_dir: tb_logs-grpo-helpsteer3-llama-3.2-1b
   mlflow:
+    experiment_name: grpo-helpsteer3
     run_name: grpo-helpsteer3-llama-3.2-1b
 cluster:
   gpus_per_node: 8
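The renamed wandb run (…-tp${policy.dtensor_cfg.tensor_parallel_size}) leans on ${...} config interpolation, so the run name tracks whatever tensor-parallel size the recipe is launched with instead of a hardcoded suffix. The syntax looks like OmegaConf-style interpolation; assuming that is what resolves these references, the behavior is roughly:

from omegaconf import OmegaConf  # assumed resolver for the ${...} references

cfg = OmegaConf.create("""
policy:
  dtensor_cfg:
    tensor_parallel_size: 1
logger:
  wandb:
    name: grpo-helpsteer3-llama-3.2-1b-tp${policy.dtensor_cfg.tensor_parallel_size}
""")

# Interpolation resolves on access, so the name follows the configured TP size.
print(cfg.logger.wandb.name)  # grpo-helpsteer3-llama-3.2-1b-tp1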

examples/configs/recipes/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-4n8g-fsdp2tp8.yaml

Lines changed: 13 additions & 9 deletions
@@ -4,23 +4,23 @@ grpo:
   max_num_steps: 10
 checkpointing:
   checkpoint_dir: results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
-  metric_name: val_reward
+  metric_name: val:reward
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 32768
   train_global_batch_size: 64
   train_micro_batch_size: 1
   logprob_batch_size: 1
-  dynamic_batching:
-    enabled: true
-  sequence_packing:
-    enabled: false
   dtensor_cfg:
     activation_checkpointing: true
     context_parallel_size: 4
     cpu_offload: true
     tensor_parallel_size: 8
     custom_parallel_plan: examples.custom_parallel.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
+  dynamic_batching:
+    enabled: true
+  sequence_packing:
+    enabled: false
   optimizer:
     kwargs:
       lr: 3.0e-07

@@ -42,19 +42,23 @@ policy:
 data:
   prompt_file: null
   dataset_name: HelpSteer3
+  split: preference
 env:
-  helpsteer3:
+  math:
+    enabled: false
+  code_jaccard:
+    enabled: true
     num_workers: 8
-    reward_model: preference_based
+    processor: helpsteer3_data_processor
 logger:
   wandb_enabled: true
-  monitor_gpus: false
   wandb:
     project: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
-    name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp8
+    name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp${policy.dtensor_cfg.tensor_parallel_size}
   tensorboard:
     log_dir: tb_logs-grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
   mlflow:
+    experiment_name: grpo-helpsteer3
     run_name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
 cluster:
   gpus_per_node: 8

examples/configs/recipes/llm/sft-nemotron-super-49b-tulu-v3.yaml

Lines changed: 6 additions & 2 deletions
@@ -3,11 +3,13 @@ sft:
   max_num_steps: 50
   val_period: 5
   val_global_batch_size: 128
+
 checkpointing:
   checkpoint_dir: results/sft_nemotron_super_49b
-  metric_name: val_loss
+  metric_name: val:loss
   keep_top_k: 100
   save_period: 500
+
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 32768

@@ -43,10 +45,11 @@ policy:
   - 10
 data:
   dataset_name: tulu3_sft_mixture
-  test_size: 0.05
   num_workers: 20
+  test_size: 0.05
 logger:
   tensorboard_enabled: false
+  monitor_gpus: false
   num_val_samples_to_print: 0
   wandb:
     project: nemotron-tulu-3-sft

@@ -58,3 +61,4 @@ logger:
     run_name: nemotron-tulu-3-sft
 cluster:
   gpus_per_node: 8
+  num_nodes: 8

examples/configs/recipes/llm/sft-nemotron-super-49b.yaml

Lines changed: 8 additions & 7 deletions
@@ -4,14 +4,15 @@ sft:
   max_num_steps: 100
   val_global_batch_size: 128
 checkpointing:
-  checkpoint_dir: results/sft_nemotron_super_49b
-  metric_name: val_loss
+  checkpoint_dir: results/sft-nemotron-super-49b
+  metric_name: val:loss
   keep_top_k: 100
   save_period: 500
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 4096
   train_global_batch_size: 128
+  train_micro_batch_size: 8
   dtensor_cfg:
     _v2: true
     activation_checkpointing: true

@@ -36,12 +37,12 @@ logger:
   monitor_gpus: false
   num_val_samples_to_print: 0
   wandb:
-    project: sft-nemotron-super-49b
-    name: sft-nemotron-super-49b
+    project: sft-nemotron
+    name: sft-${data.dataset_name}-nemotron-super-49b
   tensorboard:
-    log_dir: tb_logs-sft-nemotron-super-49b
+    log_dir: tb_logs-openmathinstruct-nemorl-1M_train
   mlflow:
     experiment_name: sft-nemotron-super-49b
-    run_name: sft-nemotron-super-49b
+    run_name: openmathinstruct-nemorl-1M_train
 cluster:
-  gpus_per_node: 8
+  gpus_per_node: 8
