@@ -112,10 +112,6 @@ def backward(ctx: Any, grad_output: torch.Tensor) -> Tuple[None, torch.Tensor]:
 class GatherTensor(torch.autograd.Function):
     """Gather the input from model parallel region and concatenate."""

-    # @staticmethod
-    # def symbolic(graph, input_):
-    #     """Symbolic function for tracing."""
-    #     return _gather_along_last_dim(input_)

     @staticmethod
     def forward(ctx, group, input_):
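For reference, the gather-along-last-dim pattern this autograd.Function relies on usually pairs an all-gather in the forward pass with a slice back to the local shard in the backward pass. The sketch below is illustrative only: it assumes an initialized torch.distributed process group and an evenly divisible last dimension, and the helper names are placeholders rather than the ones defined in this file.

import torch
import torch.distributed as dist


def _gather_along_last_dim(input_: torch.Tensor, group=None) -> torch.Tensor:
    """All-gather every rank's shard and concatenate along the last dimension."""
    world_size = dist.get_world_size(group=group)
    if world_size == 1:
        return input_
    shards = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(shards, input_.contiguous(), group=group)
    return torch.cat(shards, dim=-1)


def _split_along_last_dim(grad_output: torch.Tensor, group=None) -> torch.Tensor:
    """Backward of the gather: keep only this rank's slice of the gradient."""
    world_size = dist.get_world_size(group=group)
    if world_size == 1:
        return grad_output
    rank = dist.get_rank(group=group)
    shard = grad_output.shape[-1] // world_size  # assumes an even split
    return grad_output[..., rank * shard:(rank + 1) * shard].contiguous()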
@@ -431,8 +427,7 @@ def __init__(self, module, mp_group=None, skip_partition=False, gather_output=Fa
         super(LinearLayer, self).__init__(mp_group, **kwargs)
         self.weight = module.weight
         self.bias = module.bias
-        if gather_output:
-            b = 0
+
         if not skip_partition:
             self._tp_partition([self.weight, self.bias])
         self.support_training = True
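As a rough illustration of what partitioning these parameters across tensor-parallel ranks can look like (the real _tp_partition handles uneven shard sizes via get_shard_size, and its exact slicing is not shown in this hunk), here is a column-style split of a Linear's weight and bias; the function name and even-split assumption are mine.

import torch
import torch.nn as nn


def tp_partition_linear(weight: torch.Tensor, bias: torch.Tensor,
                        tp_index: int, tp_world_size: int):
    """Keep only this rank's slice of the output dimension: rows of weight, matching entries of bias."""
    out_features = weight.shape[0]
    shard = out_features // tp_world_size  # assumes out_features divides evenly
    rows = slice(tp_index * shard, (tp_index + 1) * shard)
    weight_shard = nn.Parameter(weight[rows, :].detach().clone())
    bias_shard = nn.Parameter(bias[rows].detach().clone()) if bias is not None else None
    return weight_shard, bias_shard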
@@ -639,7 +634,6 @@ def __init__(self, module, mp_group, **kwargs):
     def forward(self, input):
         input_shard_size = get_shard_size(input.shape[-1], self.tp_world_size, "lm_head")
         input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.tp_world_size, "lm_head")[0:self.tp_index])
-        input = input[:, :, input_shard_offset:input_shard_offset + input_shard_size]

         output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size],
                               self.weight.transpose(-1, -2))
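The removed line reassigned `input` before the matmul even though the kept code already slices it inline, so it was redundant. A simplified sketch of this row-parallel lm_head forward, assuming even sharding in place of get_shard_size/get_shard_size_list, with the usual all-reduce of the partial logits included for completeness even though it falls outside this hunk:

import torch
import torch.distributed as dist


def lm_head_forward(hidden: torch.Tensor, weight_shard: torch.Tensor,
                    tp_index: int, tp_world_size: int, group=None) -> torch.Tensor:
    shard_size = hidden.shape[-1] // tp_world_size  # assumes an even split
    offset = tp_index * shard_size
    # Slice the hidden dimension inline, as the updated code does, instead of reassigning `input`.
    partial = torch.matmul(hidden[:, :, offset:offset + shard_size],
                           weight_shard.transpose(-1, -2))
    if tp_world_size > 1:
        dist.all_reduce(partial, group=group)  # sum partial logits across the TP group
    return partial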