Skip to content

Commit 4c87e8d

Browse files
committed
fix weight-initialization issue for critic/reward model
Signed-off-by: jouw <[email protected]>
1 parent 1344ffd commit 4c87e8d

File tree

3 files changed

+5
-5
lines changed

3 files changed

+5
-5
lines changed

applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,7 @@ def train_rlhf(self, inputs):
236236
value = self.critic_model.forward_value(**batch,
237237
return_value_only=True,
238238
use_cache=False)[:, :-1]
239-
critic_loss = self.critic_loss_fn(value[:, start:], old_values[:,
240-
start:],
239+
critic_loss = self.critic_loss_fn(value[:, start:], old_values[:, start:],
241240
returns, action_mask[:, start:])
242241
self.critic_model.backward(critic_loss)
243242

applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
)
1212
from huggingface_hub import snapshot_download
1313
from transformers.integrations.deepspeed import HfDeepSpeedConfig
14+
from transformers.modeling_utils import no_init_weights
1415

1516
from dschat.utils.model.reward_model import RewardModel
1617
from dschat.utils.utils import load_state_dict_into_model, print_rank_0
@@ -99,7 +100,8 @@ def create_hf_model(model_class,
99100
dschf = None
100101
if rlhf_training:
101102
# the weight loading is handled by create critic model
102-
model = model_class.from_config(model_config)
103+
with no_init_weights():
104+
model = model_class.from_config(model_config)
103105
else:
104106
model = model_class.from_pretrained(
105107
model_name_or_path,

applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,8 +594,7 @@ def main():
594594
"-------------------------------------------------------------------------------------",
595595
args.global_rank)
596596

597-
if args.enable_tensorboard and torch.distributed.get_rank(
598-
) == 0:
597+
if args.enable_tensorboard and torch.distributed.get_rank() == 0:
599598
writer.add_scalar('reward',
600599
average_reward / inner_iter,
601600
global_step=step)

0 commit comments

Comments (0)