
Commit 530a216

add is_hf_model and is_moe_model to model state
Signed-off-by: Hemil Desai <[email protected]>
1 parent 5047e18 commit 530a216

7 files changed: +185 additions, −26 deletions

nemo_rl/models/policy/dtensor_init.py

Lines changed: 9 additions & 1 deletion
@@ -20,6 +20,7 @@

 import torch
 from accelerate import init_empty_weights
+from nemo_automodel._transformers.registry import ModelRegistry
 from nemo_automodel._transformers.utils import sliding_window_overwrite
 from nemo_automodel.components.config.loader import _resolve_target
 from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager

@@ -83,6 +84,8 @@ class ModelAndOptimizerState:
     optimizer: Optional[torch.optim.Optimizer]
     scheduler: Optional[Any]
     reference_model_state_dict: Optional[dict[str, torch.Tensor]]
+    is_hf_model: bool
+    is_moe_model: bool


 def validate_and_set_config(

@@ -423,8 +426,11 @@ def setup_model_and_optimizer(
     )

     # Parallelize model
+    is_hf_model = (
+        model_config.architectures[0] not in ModelRegistry.model_arch_name_to_cls
+    )
     is_moe_model = any(["expert" in key for key in model_state_dict_keys])
-    if not isinstance(model, PreTrainedModel) and is_moe_model:
+    if not isinstance(model, PreTrainedModel) and is_moe_model and not is_hf_model:
         moe_parallelize_model(
             model=model,
             world_mesh=device_mesh,

@@ -539,4 +545,6 @@ def setup_model_and_optimizer(
         optimizer=optimizer,
         scheduler=scheduler,
         reference_model_state_dict=reference_model_state_dict,
+        is_hf_model=is_hf_model,
+        is_moe_model=is_moe_model,
     )
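
Taken together, these changes derive two booleans during setup and store them on ModelAndOptimizerState. A minimal sketch of that logic, lifted out of setup_model_and_optimizer for illustration only (derive_model_flags is a hypothetical helper name; model_config and model_state_dict_keys are the values already in scope in the diff):

from nemo_automodel._transformers.registry import ModelRegistry


def derive_model_flags(model_config, model_state_dict_keys) -> tuple[bool, bool]:
    # Architectures without a custom implementation registered in
    # nemo_automodel's ModelRegistry fall back to the plain HF model class.
    is_hf_model = (
        model_config.architectures[0] not in ModelRegistry.model_arch_name_to_cls
    )
    # MoE detection is unchanged: any state-dict key containing "expert".
    is_moe_model = any("expert" in key for key in model_state_dict_keys)
    return is_hf_model, is_moe_model

The net effect on parallelization is that moe_parallelize_model now runs only for MoE checkpoints served by a custom (non-HF) implementation; MoE models that fall back to the HF class skip that branch.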

nemo_rl/models/policy/dtensor_policy_worker_v2.py

Lines changed: 10 additions & 1 deletion
@@ -224,7 +224,14 @@ def __init__(
         _copy_dataclass_fields(
             self,
             model_state,
-            ["model", "model_state_dict_keys", "optimizer", "scheduler"],
+            [
+                "model",
+                "model_state_dict_keys",
+                "optimizer",
+                "scheduler",
+                "is_hf_model",
+                "is_moe_model",
+            ],
         )
         if init_reference_model:
             self.reference_model_state_dict = model_state.reference_model_state_dict

@@ -503,6 +510,8 @@ def get_logprobs(
             self.cp_mesh,
             self._is_reward_model,
             self.allow_flash_attn_args,
+            self.is_hf_model,
+            self.is_moe_model,
         )

         # Process outputs for logprobs
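
The worker-side change is plumbing: the two new flags are copied from ModelAndOptimizerState onto the worker and then threaded into the model_forward call inside get_logprobs. _copy_dataclass_fields is an existing helper in the repo; assuming it performs a plain attribute copy, its effect on the new fields is roughly the following sketch (hypothetical implementation, shown only to make the field list above concrete):

def copy_flags(worker, model_state):
    # Hypothetical equivalent of _copy_dataclass_fields for the two new fields:
    # mirror each flag from the setup-time state onto the worker instance so
    # that worker.is_hf_model / worker.is_moe_model exist when get_logprobs runs.
    for name in ("is_hf_model", "is_moe_model"):
        setattr(worker, name, getattr(model_state, name))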

nemo_rl/models/policy/dtensor_train.py

Lines changed: 14 additions & 3 deletions
@@ -88,6 +88,8 @@ def forward_backward(
     enable_seq_packing: bool,
     is_reward_model: bool,
     allow_flash_attn_args: bool,
+    is_hf_model: bool,
+    is_moe_model: bool,
     eval_mode: bool,
     apply_temperature_fn,
 ) -> tuple[torch.Tensor, dict[str, Any]]:

@@ -107,6 +109,8 @@
         enable_seq_packing: Whether sequence packing is enabled
         is_reward_model: Whether this is a reward model
         allow_flash_attn_args: Whether model supports flash_attn_kwargs
+        is_hf_model: Whether the model is an HF model
+        is_moe_model: Whether the model is a MoE model
         eval_mode: Whether in evaluation mode
         apply_temperature_fn: Function to apply temperature scaling to logits

@@ -121,6 +125,8 @@
         cp_mesh,
         is_reward_model,
         allow_flash_attn_args,
+        is_hf_model,
+        is_moe_model,
     )

     # Process outputs for training (loss + backward)

@@ -230,6 +236,8 @@ def model_forward(
     cp_mesh: Any,
     is_reward_model: bool,
     allow_flash_attn_args: bool,
+    is_hf_model: bool,
+    is_moe_model: bool,
 ) -> Any:
     """Perform model forward pass.

@@ -240,7 +248,8 @@
         cp_mesh: Context parallel mesh
         is_reward_model: Whether this is a reward model
         allow_flash_attn_args: Whether model supports flash_attn_kwargs
-
+        is_hf_model: Whether the model is an HF model
+        is_moe_model: Whether the model is a MoE model
     Returns:
         Model outputs
     """

@@ -268,12 +277,14 @@
     model_args = dict(
         input_ids=input_ids,
         attention_mask=attention_mask,
-        padding_mask=~attention_mask if attention_mask is not None else None,
         position_ids=position_ids,
         use_cache=False,
         flash_attn_kwargs=flash_attn_kwargs,
         **vlm_kwargs,
     )
+    if is_moe_model and not is_hf_model:
+        padding_mask = ~attention_mask if attention_mask is not None else None
+        model_args["padding_mask"] = padding_mask

     if is_reward_model:
         # `flash_attn_kwarg` is not supported for `LlamaForSequenceClassification`.

@@ -291,7 +302,7 @@
     # Remove None attention_mask padding_mask if present
     if model_args.get("attention_mask") is None:
         del model_args["attention_mask"]
-    if model_args.get("padding_mask") is None:
+    if "padding_mask" in model_args and model_args.get("padding_mask") is None:
         del model_args["padding_mask"]

     outputs = model(**model_args)
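
The model_forward changes make padding_mask an opt-in argument: it is only built and attached for custom (non-HF) MoE models, and HF models never receive it. A condensed sketch of the resulting argument handling, assuming the surrounding locals (input_ids, attention_mask, position_ids, flash_attn_kwargs, vlm_kwargs, model) exactly as they appear in the diff:

# Base arguments passed for every model type.
model_args = dict(
    input_ids=input_ids,
    attention_mask=attention_mask,
    position_ids=position_ids,
    use_cache=False,
    flash_attn_kwargs=flash_attn_kwargs,
    **vlm_kwargs,
)
# padding_mask is only meaningful for the custom MoE implementations,
# so it is added conditionally instead of unconditionally as before.
if is_moe_model and not is_hf_model:
    model_args["padding_mask"] = (
        ~attention_mask if attention_mask is not None else None
    )
# Drop None-valued masks so they are not forwarded to the model; the
# padding_mask check now also guards against the key being absent.
if model_args.get("attention_mask") is None:
    del model_args["attention_mask"]
if "padding_mask" in model_args and model_args["padding_mask"] is None:
    del model_args["padding_mask"]
outputs = model(**model_args)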

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ automodel = [
     "flash-attn==2.8.1",
     "mamba-ssm",
     "causal-conv1d",
+    "transformers>=4.57.1",
 ]
 vllm = [
     "cuda-python",

tests/unit/models/policy/test_dtensor_init.py

Lines changed: 12 additions & 2 deletions
@@ -188,7 +188,7 @@ def test_sequence_packing_with_vlm_raises_error(
     @patch("nemo_rl.models.policy.dtensor_init.resolve_model_class")
     @patch("nemo_rl.models.policy.dtensor_init.configure_dynamo_cache")
     @patch("nemo_rl.models.policy.dtensor_init.sliding_window_overwrite")
-    @patch("nemo_rl.models.policy.dtensor_init.NeMoAutoModelForSequenceClassification")
+    @patch("nemo_automodel.NeMoAutoModelForSequenceClassification")
     def test_reward_model_bradley_terry(
         self,
         mock_rm_class,

@@ -407,7 +407,7 @@ def test_hf_config_overrides_none(
     @patch("nemo_rl.models.policy.dtensor_init.resolve_model_class")
     @patch("nemo_rl.models.policy.dtensor_init.configure_dynamo_cache")
     @patch("nemo_rl.models.policy.dtensor_init.sliding_window_overwrite")
-    @patch("nemo_rl.models.policy.dtensor_init.NeMoAutoModelForSequenceClassification")
+    @patch("nemo_automodel.NeMoAutoModelForSequenceClassification")
     def test_reward_model_with_num_labels_equals_one(
         self,
         mock_rm_class,

@@ -753,6 +753,8 @@ def test_basic_model_setup(
         assert result.scheduler == mock_scheduler
         assert result.reference_model_state_dict is not None
         assert len(result.model_state_dict_keys) > 0
+        assert isinstance(result.is_hf_model, bool)
+        assert isinstance(result.is_moe_model, bool)

     @patch("nemo_rl.models.policy.dtensor_init.init_empty_weights")
     @patch("nemo_rl.models.policy.utils.import_class_from_path")

@@ -835,6 +837,8 @@ def test_model_setup_without_optimizer(
         assert result.optimizer is None
         assert result.scheduler is None
         assert result.reference_model_state_dict is None
+        assert isinstance(result.is_hf_model, bool)
+        assert isinstance(result.is_moe_model, bool)

     @patch("nemo_rl.models.policy.dtensor_init.init_empty_weights")
     def test_context_parallel_with_gemma3_raises_error(

@@ -1014,6 +1018,8 @@ def import_side_effect(path):
         )

         assert result.scheduler == mock_final_scheduler
+        assert isinstance(result.is_hf_model, bool)
+        assert isinstance(result.is_moe_model, bool)

     @patch("nemo_rl.models.policy.dtensor_init.init_empty_weights")
     @patch("nemo_rl.models.policy.utils.import_class_from_path")

@@ -1211,10 +1217,14 @@ def test_model_and_optimizer_state_creation(self):
             optimizer=MagicMock(),
             scheduler=MagicMock(),
             reference_model_state_dict={"layer.weight": torch.zeros(10, 10)},
+            is_hf_model=False,
+            is_moe_model=True,
         )

         assert state.model is not None
         assert len(state.model_state_dict_keys) == 2
         assert state.optimizer is not None
         assert state.scheduler is not None
         assert state.reference_model_state_dict is not None
+        assert state.is_hf_model is False
+        assert state.is_moe_model is True
