
Commit c32778d

feat: Support for nano-v2 (#1514)
Signed-off-by: Yi-Fu Wu <[email protected]>
1 parent 775fc34 commit c32778d

File tree: 9 files changed (+191, -5 lines)

Submodule Megatron-Bridge updated 48 files
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 30
checkpointing:
  checkpoint_dir: results/grpo-nano-v2-12b-1n8g-megatron
policy:
  model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  tokenizer:
    name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  optimizer: null
  megatron_cfg:
    enabled: true
    bias_activation_fusion: false
    tensor_model_parallel_size: 8
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512
  sequence_packing:
    enabled: false
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-nano-v2-12b-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-nano-v2-12b-1n8g-megatron
cluster:
  gpus_per_node: 8
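
The recipe above only lists overrides; every other setting is inherited from the referenced ../../grpo_math_1B.yaml base config. A minimal sketch of that layering, assuming an OmegaConf-style deep merge (load_recipe and the filename below are illustrative, not the repo's actual loader):

# Sketch only: resolve a recipe that declares `defaults: <base>.yaml` by
# deep-merging its overrides on top of the base config. The real NeMo RL
# loader may differ; `load_recipe` and the example path are hypothetical.
from pathlib import Path

from omegaconf import OmegaConf


def load_recipe(recipe_path: str):
    recipe = OmegaConf.load(recipe_path)
    base_rel = recipe.pop("defaults", None)
    if base_rel is None:
        return recipe
    base = OmegaConf.load(Path(recipe_path).parent / base_rel)
    # Later arguments win, so recipe keys override the base defaults.
    return OmegaConf.merge(base, recipe)


cfg = load_recipe("grpo-nano-v2-12b-1n8g-megatron.yaml")  # hypothetical path
print(cfg.policy.megatron_cfg.tensor_model_parallel_size)  # -> 8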
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 30
checkpointing:
  checkpoint_dir: results/grpo-nano-v2-12b-2n8g-fsdp2tp1
policy:
  model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  tokenizer:
    name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  dtensor_cfg:
    cpu_offload: true
    activation_checkpointing: true
  dynamic_batching:
    enabled: true
  sequence_packing:
    enabled: false
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512
  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 13
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [13]
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-nano-v2-12b-2n8g-fsdp2tp1
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-nano-v2-12b-2n8g-fsdp2tp1
cluster:
  gpus_per_node: 8
  num_nodes: 2

nemo_rl/models/megatron/common.py

Lines changed: 6 additions & 1 deletion
@@ -348,12 +348,17 @@ def forward_step_arbitrary_loss(
    if len(multimodal_data) > 0:
        position_ids = None

+    additional_kwargs = {}
+    # Mamba models currently do not support packed_seq_params
+    if packed_seq_params is not None:
+        additional_kwargs["packed_seq_params"] = packed_seq_params
+
    with straggler_timer:
        output_tensor = model(
            input_ids=input_ids_cp_sharded,
            position_ids=position_ids,
            attention_mask=attention_mask,
-            packed_seq_params=packed_seq_params,
+            **additional_kwargs,
            **multimodal_data,
        )
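
The change above only forwards packed_seq_params when it is set, because Mamba-based models such as Nemotron Nano v2 do not accept that keyword. A self-contained sketch of the same pattern with toy stand-ins (fake_transformer_forward and fake_mamba_forward are not the repo's API):

# Only pass packed_seq_params when it is actually set, so a forward() that does
# not declare the parameter (e.g. a Mamba-based stack) never sees the keyword.
# Both forward functions below are toy stand-ins for illustration.
def fake_transformer_forward(input_ids, position_ids=None, attention_mask=None,
                             packed_seq_params=None):
    return {"saw_packed_seq_params": packed_seq_params is not None}


def fake_mamba_forward(input_ids, position_ids=None, attention_mask=None):
    # No packed_seq_params parameter at all.
    return {"saw_packed_seq_params": False}


def call_model(forward_fn, input_ids, packed_seq_params=None):
    additional_kwargs = {}
    if packed_seq_params is not None:
        additional_kwargs["packed_seq_params"] = packed_seq_params
    return forward_fn(input_ids=input_ids, **additional_kwargs)


print(call_model(fake_transformer_forward, [1, 2, 3], packed_seq_params=object()))
print(call_model(fake_mamba_forward, [1, 2, 3]))  # works: the kwarg is simply omitted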

nemo_rl/models/megatron/community_import.py

Lines changed: 8 additions & 0 deletions
@@ -42,6 +42,7 @@ def import_model_from_hf_name(
    # Keep track of defaults so can restore them to the config after loading the model
    orig_tensor_model_parallel_size = model_provider.tensor_model_parallel_size
    orig_pipeline_model_parallel_size = model_provider.pipeline_model_parallel_size
+    orig_context_parallel_size = model_provider.context_parallel_size
    orig_expert_model_parallel_size = model_provider.expert_model_parallel_size
    orig_expert_tensor_parallel_size = model_provider.expert_tensor_parallel_size
    orig_num_layers_in_first_pipeline_stage = (
@@ -59,6 +60,7 @@ def import_model_from_hf_name(
    model_provider.pipeline_model_parallel_size = megatron_config[
        "pipeline_model_parallel_size"
    ]
+    model_provider.context_parallel_size = megatron_config["context_parallel_size"]
    model_provider.expert_model_parallel_size = megatron_config[
        "expert_model_parallel_size"
    ]
@@ -83,6 +85,7 @@ def import_model_from_hf_name(
    config = megatron_model[0].config
    config.tensor_model_parallel_size = orig_tensor_model_parallel_size
    config.pipeline_model_parallel_size = orig_pipeline_model_parallel_size
+    config.context_parallel_size = orig_context_parallel_size
    config.expert_model_parallel_size = orig_expert_model_parallel_size
    config.expert_tensor_parallel_size = orig_expert_tensor_parallel_size
    config.num_layers_in_first_pipeline_stage = orig_num_layers_in_first_pipeline_stage
@@ -123,6 +126,11 @@ def export_model_from_megatron(

    # Export performs on CPU with proper distributed context
    with temporary_distributed_context(backend="gloo"):
+        # Need to set model parallel cuda manual seed for mamba mixer
+        from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed
+
+        model_parallel_cuda_manual_seed(0)
+
        # Load the Megatron model
        megatron_model = bridge.load_megatron_model(
            input_path, skip_temp_dist_context=True
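
context_parallel_size is now saved and restored alongside the other parallel sizes so import-time overrides do not leak into the exported config, and the manual seed is required before building the Mamba mixer. A stripped-down sketch of the save/override/restore pattern (Provider is a toy dataclass, not Megatron-Bridge's model provider):

# Toy illustration of the pattern in import_model_from_hf_name: remember the
# provider's parallelism defaults, override them for the import, then write the
# originals back onto the resulting config. `Provider` is a stand-in dataclass.
from dataclasses import dataclass


@dataclass
class Provider:
    tensor_model_parallel_size: int = 1
    pipeline_model_parallel_size: int = 1
    context_parallel_size: int = 1


def import_with_overrides(provider: Provider, megatron_config: dict) -> Provider:
    originals = {
        "tensor_model_parallel_size": provider.tensor_model_parallel_size,
        "pipeline_model_parallel_size": provider.pipeline_model_parallel_size,
        "context_parallel_size": provider.context_parallel_size,
    }
    for key, value in megatron_config.items():
        setattr(provider, key, value)

    config = provider  # in the real code this is the loaded model's config

    # Restore the defaults so import-time parallelism does not stick to the config.
    for key, value in originals.items():
        setattr(config, key, value)
    return config


cfg = import_with_overrides(Provider(), {"context_parallel_size": 2})
print(cfg.context_parallel_size)  # -> 1: the original default is restored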

nemo_rl/models/policy/megatron_policy_worker.py

Lines changed: 12 additions & 3 deletions
@@ -269,7 +269,7 @@ def freeze_moe_router(megatron_model):
        if hasattr(model_module, "language_model"):
            model_module = model_module.language_model
        for layer in model_module.decoder.layers:
-            if hasattr(layer.mlp, "router"):
+            if hasattr(layer, "mlp") and hasattr(layer.mlp, "router"):
                layer.mlp.router.weight.requires_grad = False

    mixed_precision_wrapper = CustomFloat16Module
@@ -1271,12 +1271,17 @@ def forward_step_fn(
            if len(multimodal_data) > 0:
                position_ids = None

+            additional_kwargs = {}
+            # Mamba models currently do not support packed_seq_params
+            if packed_seq_params is not None:
+                additional_kwargs["packed_seq_params"] = packed_seq_params
+
            output_tensor = model(
                input_ids=input_ids_cp_sharded,
                position_ids=position_ids,
                attention_mask=attention_mask,
-                packed_seq_params=packed_seq_params,
                **multimodal_data,
+                **additional_kwargs,
            )

            # Apply temperature scaling to logits for training
@@ -1550,11 +1555,15 @@ def forward_step_fn(
            if len(multimodal_data) > 0:
                position_ids = None

+            additional_kwargs = {}
+            if packed_seq_params is not None:
+                additional_kwargs["packed_seq_params"] = packed_seq_params
+
            output_tensor = model(
                input_ids=input_ids_cp_sharded,
                position_ids=position_ids,
                attention_mask=attention_mask,
-                packed_seq_params=packed_seq_params,
+                **additional_kwargs,
                **multimodal_data,
            )
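
freeze_moe_router previously assumed every decoder layer has an mlp attribute, which is not true for the Mamba layers in a hybrid stack like Nemotron Nano v2. A toy illustration of why the extra hasattr(layer, "mlp") guard is needed (the classes below are stand-ins, not Megatron modules):

# Without the hasattr(layer, "mlp") guard, evaluating layer.mlp on a Mamba layer
# raises AttributeError before hasattr(layer.mlp, "router") can return False.
class Router:
    pass


class MoEMLP:
    router = Router()


class TransformerLayer:
    mlp = MoEMLP()


class MambaLayer:
    pass  # no mlp attribute


for layer in [TransformerLayer(), MambaLayer()]:
    # The guarded check simply skips layers without an mlp instead of crashing.
    if hasattr(layer, "mlp") and hasattr(layer.mlp, "router"):
        print("would freeze router on", type(layer).__name__)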

tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source $SCRIPT_DIR/common.env

# ===== BEGIN CONFIG =====
NUM_NODES=1
STEPS_PER_RUN=30
MAX_STEPS=30
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=60
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd $PROJECT_ROOT
uv run examples/run_grpo_math.py \
    --config $CONFIG_PATH \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir=$LOG_DIR \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name=$EXP_NAME \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir=$CKPT_DIR \
    $@ \
    2>&1 | tee $RUN_LOG

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py $JSON_METRICS \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        'data["train/token_mult_prob_error"]["30"] < 1.05' \
        'data["train/reward"]["30"] > 0.4' \
        'mean(data["timing/train/total_step_time"], -6, -1) < 80'
fi
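
The jq gate above finds the highest step recorded under train/loss in the dumped metrics and only runs the check_metrics assertions once that step reaches MAX_STEPS. The same check written out in Python (the metrics layout is inferred from the jq query; the path is hypothetical):

# Python equivalent of the jq gate: find the highest step logged under
# "train/loss" and only proceed if it reached MAX_STEPS. The metrics layout
# {metric_name: {step_str: value}} is inferred from the jq expression.
import json

MAX_STEPS = 30


def reached_target_step(json_metrics_path: str, max_steps: int = MAX_STEPS) -> bool:
    with open(json_metrics_path) as f:
        data = json.load(f)
    steps = [int(step) for step in data.get("train/loss", {})]
    return bool(steps) and max(steps) >= max_steps


# if reached_target_step("metrics.json"):  # hypothetical path
#     ... run the check_metrics assertions ...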
tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source $SCRIPT_DIR/common.env

# ===== BEGIN CONFIG =====
NUM_NODES=2
STEPS_PER_RUN=30
MAX_STEPS=30
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=60
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd $PROJECT_ROOT
uv run examples/run_grpo_math.py \
    --config $CONFIG_PATH \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir=$LOG_DIR \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name=$EXP_NAME \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir=$CKPT_DIR \
    $@ \
    2>&1 | tee $RUN_LOG

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py $JSON_METRICS \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        'data["train/token_mult_prob_error"]["30"] < 1.05' \
        'data["train/reward"]["30"] > 0.4' \
        'mean(data["timing/train/total_step_time"], -6, -1) < 60'
fi

tests/test_suites/nightly.txt

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,10 @@ tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh
#https://github.com/NVIDIA-NeMo/RL/issues/1374
#tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh

+# Nano-v2
+tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
+tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh
+
#######
# SFT #
#######