-# Base GRPO Algorithm Configuration for HelpSteer3 dataset
+# TODO @rayen: remove this file after refactor
+defaults: grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1 # for multi-turn rollouts. HelpSteer3 conversations can have multiple turns
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  use_dynamic_sampling: false
-  batch_multiplier: 1
-  dynamic_sampling_max_gen_batches: 10
-  reward_shaping:
-    enabled: false
-    overlong_buffer_length: 128
-    overlong_buffer_penalty: 1
-    max_response_length: ${policy.max_total_sequence_length}
-  reward_scaling:
-    enabled: false
-    source_min: 0.0
-    source_max: 1.0
-    target_min: 0.0
-    target_max: 1.0
-
-  async_grpo:
-    enabled: false # Set to true to enable async training mode
-    # Max age (in training steps) for trajectories used in training
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  sequence_level_importance_ratios: false
-  truncated_importance_sampling_ratio: null
-  token_level_loss: true
-
 checkpointing:
-  enabled: true
-  checkpoint_dir: "results/grpo-helpsteer3"
-  metric_name: "val_reward"
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
-  model_save_format: "safetensors"
-  save_consolidated: false
-
+  checkpoint_dir: results/grpo-helpsteer3
+  metric_name: val_reward
 policy:
-  model_name: "meta-llama/Llama-3.2-1B-Instruct"
-  tokenizer:
-    name: ${policy.model_name}
+  model_name: meta-llama/Llama-3.2-1B-Instruct
   max_total_sequence_length: 2048
-  precision: "bfloat16"
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  logprob_batch_size: 4
-  logprob_chunk_size: null
-
-  dtensor_cfg:
-    _v2: true
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
-
-  megatron_cfg:
-    enabled: false
-
-  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
-  # for more details on dynamic batching and sequence packing.
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
-
+    enabled: true
   sequence_packing:
-    enabled: False
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-
-  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  max_grad_norm: 1.0
-
-  optimizer:
-    name: "torch.optim.AdamW"
-    kwargs:
-      lr: 5.0e-6
-      weight_decay: 0.01
-      betas: [0.9, 0.999]
-      eps: 1e-8
-
-  scheduler:
-    - name: "torch.optim.lr_scheduler.LinearLR"
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        # The scheduler iterates once per GRPO step and is decoupled from the optimizer step (there may be >=1 optimizer steps per GRPO step)
-        total_iters: 50
-    - name: "torch.optim.lr_scheduler.ConstantLR"
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones: [50]
-
-  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
-    vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      # When enforce_eager is False, you can optionally set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy;
-      # with that flag, vLLM uses its custom CUDA kernels instead of the Triton kernels generated by torch.compile.
-      # For more details, see the convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
-      enforce_eager: False
-      use_deep_gemm: False
-      num_last_layers_in_bf16: 0
-      num_first_layers_in_bf16: 0
-      vllm_kwargs: {}
-    colocated:
-      # true: generation shares training GPUs
-      # false: uses dedicated generation resources
-      enabled: true
-      # only relevant when enabled is false
-      resources:
-        gpus_per_node: null # Number of GPUs dedicated to generation when the cluster has a single node (i.e. cluster.num_nodes == 1)
-        num_nodes: null # Number of nodes dedicated to generation
-
+    enabled: false
 data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: null # HelpSteer3 contains its own prompts
-  system_prompt_file: null
-  shuffle: true
-  num_workers: 1
-  dataset_name: "HelpSteer3"
-  # The HelpSteer3 preference dataset is converted to response format for GRPO:
-  # the preferred responses are used as target responses for the environment.
-
+  prompt_file: null
+  dataset_name: HelpSteer3
 env:
   helpsteer3:
     num_workers: 8
-    # Environment configuration for HelpSteer3 preference-based rewards
-    reward_model: "preference_based" # Use preference scores as rewards
-
+    reward_model: preference_based
 logger:
-  log_dir: "logs" # Base directory for all logs
-  wandb_enabled: true # Make sure you run `wandb login [Your API key]` before running
-  tensorboard_enabled: false
-  mlflow_enabled: false
-  monitor_gpus: false # If true, monitor GPU usage and log to wandb and/or tensorboard
+  wandb_enabled: true
+  monitor_gpus: false
   wandb:
-    project: "grpo-helpsteer3"
-    name: "grpo-helpsteer3"
+    project: grpo-helpsteer3
+    name: grpo-helpsteer3
   tensorboard:
-    log_dir: "tb_logs-grpo-helpsteer3"
+    log_dir: tb_logs-grpo-helpsteer3
   mlflow:
-    experiment_name: "grpo-helpsteer3"
-    run_name: "grpo-helpsteer3"
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+    experiment_name: grpo-helpsteer3
+    run_name: grpo-helpsteer3
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
-
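
Note: the rewritten file keeps only the values that differ from the base config and points at it via `defaults: grpo_math_1B.yaml`. As a rough mental model (not the repository's actual loader, which may resolve defaults differently), the override resolution can be sketched with OmegaConf; the `load_with_defaults` helper name and the file path below are illustrative only.

from pathlib import Path

from omegaconf import DictConfig, OmegaConf


def load_with_defaults(path: str) -> DictConfig:
    """Load a YAML config and recursively merge it on top of the file named by its `defaults` key."""
    cfg = OmegaConf.load(path)
    base_name = cfg.pop("defaults", None)  # e.g. "grpo_math_1B.yaml"
    if base_name is None:
        return cfg
    base = load_with_defaults(str(Path(path).parent / base_name))
    # Keys present in the override file (this one) win over the base config.
    return OmegaConf.merge(base, cfg)


cfg = load_with_defaults("grpo_helpsteer3.yaml")  # illustrative path
print(cfg.grpo.max_num_steps)  # 500 (overridden here)
print(cfg.policy.model_name)   # meta-llama/Llama-3.2-1B-Instruct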
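
The removed `dynamic_batching` and `sequence_packing` sections computed their per-microbatch token budgets with `${mul: ...}` interpolations. `mul` is not a built-in OmegaConf resolver, so the training code presumably registers one; the snippet below only illustrates the equivalent behavior and is not taken from the repository.

from omegaconf import OmegaConf

# Register a `mul` resolver so interpolations like the removed
# `train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}`
# can resolve to max_total_sequence_length * train_micro_batch_size.
OmegaConf.register_new_resolver("mul", lambda a, b: a * b, replace=True)

cfg = OmegaConf.create(
    {
        "policy": {
            "max_total_sequence_length": 2048,
            "train_micro_batch_size": 4,
            "dynamic_batching": {
                "train_mb_tokens": "${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}",
            },
        }
    }
)
print(cfg.policy.dynamic_batching.train_mb_tokens)  # 8192 tokens per training microbatch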