From 1344ffde9e61b3b67e8ab9d6fa1daa86f93f730d Mon Sep 17 00:00:00 2001
From: Hongwei Chen <33092912+hwchen2017@users.noreply.github.com>
Date: Sat, 21 Jun 2025 10:20:30 -0700
Subject: [PATCH 1/4] Add file extension (#980)

Signed-off-by: Hongwei Chen <hongweichen@microsoft.com>
Signed-off-by: jouw <jouw@foxmail.com>
---
 deepnvme/model_checkpoint/{README => README.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename deepnvme/model_checkpoint/{README => README.md} (100%)

diff --git a/deepnvme/model_checkpoint/README b/deepnvme/model_checkpoint/README.md
similarity index 100%
rename from deepnvme/model_checkpoint/README
rename to deepnvme/model_checkpoint/README.md

From 4c87e8de4866f99a87dc24d30b3fee6803358180 Mon Sep 17 00:00:00 2001
From: jouw <jouw@foxmail.com>
Date: Thu, 26 Jun 2025 18:36:19 +0800
Subject: [PATCH 2/4] fix init weights issue for critic/reward model

Signed-off-by: jouw <jouw@foxmail.com>
---
 applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py        | 3 +--
 applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py | 4 +++-
 .../DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py     | 3 +--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
index 22cba6be0..1801d42f9 100644
--- a/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
+++ b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
@@ -236,8 +236,7 @@ def train_rlhf(self, inputs):
         value = self.critic_model.forward_value(**batch,
                                                 return_value_only=True,
                                                 use_cache=False)[:, :-1]
-        critic_loss = self.critic_loss_fn(value[:, start:], old_values[:,
-                                                                       start:],
+        critic_loss = self.critic_loss_fn(value[:, start:], old_values[:, start:],
                                           returns, action_mask[:, start:])
         self.critic_model.backward(critic_loss)
 
diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py
index 050819a22..0a37fa299 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py
@@ -11,6 +11,7 @@
 )
 from huggingface_hub import snapshot_download
 from transformers.integrations.deepspeed import HfDeepSpeedConfig
+from transformers.modeling_utils import no_init_weights
 
 from dschat.utils.model.reward_model import RewardModel
 from dschat.utils.utils import load_state_dict_into_model, print_rank_0
@@ -99,7 +100,8 @@ def create_hf_model(model_class,
         dschf = None
     if rlhf_training:
         # the weight loading is handled by create critic model
-        model = model_class.from_config(model_config)
+        with no_init_weights():
+            model = model_class.from_config(model_config)
     else:
         model = model_class.from_pretrained(
             model_name_or_path,
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
index 1378dc4e6..a6647d92b 100644
--- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
@@ -594,8 +594,7 @@ def main():
                     "-------------------------------------------------------------------------------------",
                     args.global_rank)
 
-                if args.enable_tensorboard and torch.distributed.get_rank(
-                ) == 0:
+                if args.enable_tensorboard and torch.distributed.get_rank() == 0:
                     writer.add_scalar('reward',
                                       average_reward / inner_iter,
                                       global_step=step)

From 9a7062bb14576b81f3a836b00fc9b88c76780e49 Mon Sep 17 00:00:00 2001
From: raviguptaamd <ravi.gupta@amd.com>
Date: Thu, 3 Jul 2025 22:24:42 -0700
Subject: [PATCH 3/4] Update submodule link to reflect https style (#981)

Signed-off-by: raviguptaamd <ravi.gupta@amd.com>
Signed-off-by: jouw <jouw@foxmail.com>
---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 1e192cfb3..5190f4b46 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "training/DeepSpeed-Domino/Megatron-LM"]
 	path = training/DeepSpeed-Domino/Megatron-LM
-	url = git@github.com:NVIDIA/Megatron-LM.git
+	url = https://github.com/NVIDIA/Megatron-LM.git

From 8000b6fcd1f7365a23bb73583b594b788346ed44 Mon Sep 17 00:00:00 2001
From: jouw <jouw@foxmail.com>
Date: Mon, 7 Jul 2025 10:46:41 +0800
Subject: [PATCH 4/4] fix formatting issue

Signed-off-by: jouw <jouw@foxmail.com>
---
 applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py         | 3 ++-
 .../DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py      | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
index 1801d42f9..22cba6be0 100644
--- a/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
+++ b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
@@ -236,7 +236,8 @@ def train_rlhf(self, inputs):
         value = self.critic_model.forward_value(**batch,
                                                 return_value_only=True,
                                                 use_cache=False)[:, :-1]
-        critic_loss = self.critic_loss_fn(value[:, start:], old_values[:, start:],
+        critic_loss = self.critic_loss_fn(value[:, start:], old_values[:,
+                                                                       start:],
                                           returns, action_mask[:, start:])
         self.critic_model.backward(critic_loss)
 
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
index a6647d92b..1378dc4e6 100644
--- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
@@ -594,7 +594,8 @@ def main():
                     "-------------------------------------------------------------------------------------",
                     args.global_rank)
 
-                if args.enable_tensorboard and torch.distributed.get_rank() == 0:
+                if args.enable_tensorboard and torch.distributed.get_rank(
+                ) == 0:
                     writer.add_scalar('reward',
                                       average_reward / inner_iter,
                                       global_step=step)