
Commit 85b1726

refactor yaml

Signed-off-by: Yuki Huang <[email protected]>
1 parent f272c41 commit 85b1726

6 files changed: +173 / -346 lines changed
Lines changed: 17 additions & 175 deletions
@@ -1,192 +1,34 @@
-# Base GRPO Algorithm Configuration for HelpSteer3 dataset
+# TODO @rayen: remove this file after refactor
+defaults: grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1 # for multi-turn rollouts. HelpSteer3 conversations can have multiple turns
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  use_dynamic_sampling: false
-  batch_multiplier: 1
-  dynamic_sampling_max_gen_batches: 10
-  reward_shaping:
-    enabled: false
-    overlong_buffer_length: 128
-    overlong_buffer_penalty: 1
-    max_response_length: ${policy.max_total_sequence_length}
-  reward_scaling:
-    enabled: false
-    source_min: 0.0
-    source_max: 1.0
-    target_min: 0.0
-    target_max: 1.0
-
-async_grpo:
-  enabled: false # Set to true to enable async training mode
-  # Max age (in training steps) for trajectories used in training
-  max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  sequence_level_importance_ratios: false
-  truncated_importance_sampling_ratio: null
-  token_level_loss: true
-
 checkpointing:
-  enabled: true
-  checkpoint_dir: "results/grpo-helpsteer3"
-  metric_name: "val_reward"
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
-  model_save_format: "safetensors"
-  save_consolidated: false
-
+  checkpoint_dir: results/grpo-helpsteer3
+  metric_name: val_reward
 policy:
-  model_name: "meta-llama/Llama-3.2-1B-Instruct"
-  tokenizer:
-    name: ${policy.model_name}
+  model_name: meta-llama/Llama-3.2-1B-Instruct
   max_total_sequence_length: 2048
-  precision: "bfloat16"
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  logprob_batch_size: 4
-  logprob_chunk_size: null
-
-  dtensor_cfg:
-    _v2: true
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
-
-  megatron_cfg:
-    enabled: false
-
-  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
-  # for more details on dynamic batching and sequence packing.
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
-
+    enabled: true
   sequence_packing:
-    enabled: False
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-
-  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  max_grad_norm: 1.0
-
-  optimizer:
-    name: "torch.optim.AdamW"
-    kwargs:
-      lr: 5.0e-6
-      weight_decay: 0.01
-      betas: [0.9, 0.999]
-      eps: 1e-8
-
-  scheduler:
-    - name: "torch.optim.lr_scheduler.LinearLR"
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        # The scheduler iteration is per GPRO step and is decoupled with the optimizer step (may be >=1 per GPRO step)
-        total_iters: 50
-    - name: "torch.optim.lr_scheduler.ConstantLR"
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones: [50]
-
-  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
-    vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      # when enforce_eager is False, it is optional to set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy,
-      # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
-      # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
-      enforce_eager: False
-      use_deep_gemm: False
-      num_last_layers_in_bf16: 0
-      num_first_layers_in_bf16: 0
-      vllm_kwargs: {}
-    colocated:
-      # true: generation shares training GPUs
-      # false: uses dedicated generation resources
-      enabled: true
-      # only relevant when enabled is false
-      resources:
-        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
-        num_nodes: null # Decides number of nodes to be dedicated to generation
-
+    enabled: false
 data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: null # HelpSteer3 contains its own prompts
-  system_prompt_file: null
-  shuffle: true
-  num_workers: 1
-  dataset_name: "HelpSteer3"
-  # HelpSteer3 preference dataset will be converted to response format for GRPO
-  # The preferred responses will be used as target responses for the environment
-
+  prompt_file: null
+  dataset_name: HelpSteer3
 env:
   helpsteer3:
     num_workers: 8
-    # Environment configuration for HelpSteer3 preference-based rewards
-    reward_model: "preference_based" # Use preference scores as rewards
-
+    reward_model: preference_based
 logger:
-  log_dir: "logs" # Base directory for all logs
-  wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running
-  tensorboard_enabled: false
-  mlflow_enabled: false
-  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb_enabled: true
+  monitor_gpus: false
   wandb:
-    project: "grpo-helpsteer3"
-    name: "grpo-helpsteer3"
+    project: grpo-helpsteer3
+    name: grpo-helpsteer3
   tensorboard:
-    log_dir: "tb_logs-grpo-helpsteer3"
+    log_dir: tb_logs-grpo-helpsteer3
   mlflow:
-    experiment_name: "grpo-helpsteer3"
-    run_name: "grpo-helpsteer3"
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+    experiment_name: grpo-helpsteer3
+    run_name: grpo-helpsteer3
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
-
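
Note: after this refactor the HelpSteer3 base file keeps only its deltas and pulls everything else from grpo_math_1B.yaml through the defaults key. A minimal sketch of the expected resolution, assuming defaults performs a recursive merge in which keys set in this overlay win and the base supplies defaults like the ones removed above:

# sketch: keys set here override grpo_math_1B.yaml; everything else is inherited
checkpointing:
  checkpoint_dir: results/grpo-helpsteer3        # overlay value wins
  # keep_top_k, save_period, ... come from the base file
policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct   # overlay value wins
  # precision, optimizer, scheduler, generation come from the base file
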
Lines changed: 26 additions & 14 deletions
@@ -1,24 +1,36 @@
-defaults: ../../grpo_helpsteer3.yaml
-
-# GRPO Algorithm Configuration for Llama-3.2-1B with HelpSteer3
+defaults: ../../grpo_math_1B.yaml
 grpo:
   max_num_epochs: 3
-
+  max_num_steps: 500
 checkpointing:
-  checkpoint_dir: "results/grpo-helpsteer3-llama-3.2-1b-5"
-
+  checkpoint_dir: results/grpo-helpsteer3-llama-3.2-1b
+  metric_name: val_reward
 policy:
+  model_name: meta-llama/Llama-3.2-1B-Instruct
+  max_total_sequence_length: 2048
   generation:
-    stop_token_ids:
-    - 128009 # <|eot_id|> for Llama-3.2
-
+    stop_token_ids:
+    - 128009
+  dynamic_batching:
+    enabled: true
+  sequence_packing:
+    enabled: false
+data:
+  prompt_file: null
+  dataset_name: HelpSteer3
+env:
+  helpsteer3:
+    num_workers: 8
+    reward_model: preference_based
 logger:
+  wandb_enabled: true
   tensorboard_enabled: true
-  monitor_gpus: true
   wandb:
-    project: "grpo-helpsteer3-llama-3.2-1b"
-    name: "grpo-helpsteer3-llama-3.2-1b-tp${policy.dtensor_cfg.tensor_parallel_size}"
+    project: grpo-helpsteer3-llama-3.2-1b
+    name: grpo-helpsteer3-llama-3.2-1b
   tensorboard:
-    log_dir: "tb_logs-grpo-helpsteer3-llama-3.2-1b"
+    log_dir: tb_logs-grpo-helpsteer3-llama-3.2-1b
   mlflow:
-    run_name: "grpo-helpsteer3-llama-3.2-1b"
+    run_name: grpo-helpsteer3-llama-3.2-1b
+cluster:
+  gpus_per_node: 8
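
Note: the dynamic_batching block enabled here still takes its token budgets from the resolver expressions that lived in the deleted HelpSteer3 base file. Assuming grpo_math_1B.yaml carries the same defaults (train_micro_batch_size: 4, logprob_batch_size: 4), the budgets for this 2048-token run would resolve to:

dynamic_batching:
  enabled: true
  # ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
  train_mb_tokens: 8192      # 2048 * 4
  # ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
  logprob_mb_tokens: 8192    # 2048 * 4
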
Lines changed: 33 additions & 29 deletions
@@ -1,57 +1,61 @@
-defaults: ../../grpo_helpsteer3.yaml
-
-# GRPO Algorithm Configuration for Llama-3.3-Nemotron-Super-49B-v1.5 with HelpSteer3
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
-  max_num_epochs: 1
   max_num_steps: 10
-
 checkpointing:
-  checkpoint_dir: "results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-3"
-
+  checkpoint_dir: results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
+  metric_name: val_reward
 policy:
   model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   max_total_sequence_length: 32768
   train_global_batch_size: 64
   train_micro_batch_size: 1
   logprob_batch_size: 1
-
+  dynamic_batching:
+    enabled: true
+  sequence_packing:
+    enabled: false
   dtensor_cfg:
     activation_checkpointing: true
     context_parallel_size: 4
     cpu_offload: true
-    sequence_parallel: false
     tensor_parallel_size: 8
     custom_parallel_plan: examples.custom_parallel.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
-
   optimizer:
     kwargs:
-      lr: 3.0e-7
-
+      lr: 3.0e-07
   scheduler:
-    - name: "torch.optim.lr_scheduler.LinearLR"
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 13
-    - name: "torch.optim.lr_scheduler.ConstantLR"
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones: [13]
-
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1.0
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1.0
+        total_iters: 10000000000
+    - milestones:
+      - 13
   generation:
     vllm_cfg:
       tensor_parallel_size: 4
-
+data:
+  prompt_file: null
+  dataset_name: HelpSteer3
+env:
+  helpsteer3:
+    num_workers: 8
+    reward_model: preference_based
 logger:
+  wandb_enabled: true
+  monitor_gpus: false
   wandb:
-    project: "grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5"
-    name: "grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp${policy.dtensor_cfg.tensor_parallel_size}"
+    project: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
+    name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp8
   tensorboard:
-    log_dir: "tb_logs-grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5"
+    log_dir: tb_logs-grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
   mlflow:
-    run_name: "grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5"
-
+    run_name: grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
 cluster:
+  gpus_per_node: 8
   num_nodes: 16
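
Note: the 49B recipe shards one training replica across tensor_parallel_size * context_parallel_size GPUs. Under the cluster block above, and assuming data parallelism fills the remaining GPUs while generation stays colocated with training (as in the deleted base config), the layout works out roughly to:

cluster:
  gpus_per_node: 8
  num_nodes: 16                  # 16 * 8 = 128 GPUs in total
policy:
  dtensor_cfg:
    tensor_parallel_size: 8
    context_parallel_size: 4     # 8 * 4 = 32 GPUs per training replica -> 128 / 32 = 4 replicas
  generation:
    vllm_cfg:
      tensor_parallel_size: 4    # each colocated vLLM engine spans 4 of the same GPUs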
