@@ -109,23 +109,23 @@ def backward(ctx: Any, grad_output: torch.Tensor) -> Tuple[None, torch.Tensor]:
         dist.all_reduce(grad_output.contiguous(), group=ctx.group)
         return None, grad_output
 
+
 class GatherTensor(torch.autograd.Function):
     """Gather the input from model parallel region and concatenate."""
 
-
     @staticmethod
     def forward(ctx, group, input_):
         """Forward function."""
         # gather along last dim
-        world_size=dist.get_world_size(group)
-        if world_size==1:
-            return
-        ctx.group=group
-        ctx.world_size=world_size
-
-        gather_shape = (world_size,) + input_.shape
-        output = torch.empty(gather_shape, dtype=input_.dtype, device=get_accelerator().current_device_name()   )
-        dist.all_gather_into_tensor(output, input_.contiguous(),   group)
+        world_size = dist.get_world_size(group)
+        if world_size == 1:
+            return
+        ctx.group = group
+        ctx.world_size = world_size
+
+        gather_shape = (world_size, ) + input_.shape
+        output = torch.empty(gather_shape, dtype=input_.dtype, device=get_accelerator().current_device_name())
+        dist.all_gather_into_tensor(output, input_.contiguous(), group)
         tensor_list = output.chunk(world_size, dim=0)
         output = torch.cat(tensor_list, dim=-1).squeeze(0).contiguous()
         return output
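
The chunk/cat/squeeze post-processing in GatherTensor.forward can be checked in a single process. The sketch below (not part of the diff; all variable names are illustrative) fills the gather buffer by hand instead of calling dist.all_gather_into_tensor, then applies the same reshaping.

import torch

# Emulate a full activation and the per-rank shards it would be split into.
world_size = 4
full = torch.arange(2 * 3 * 8, dtype=torch.float32).reshape(2, 3, 8)
shards = full.chunk(world_size, dim=-1)
local = shards[0]  # what "rank 0" would pass in as input_

# dist.all_gather_into_tensor writes into a (world_size,) + input_.shape buffer;
# here the buffer is filled by hand so no process group is needed.
gather_shape = (world_size, ) + local.shape
output = torch.empty(gather_shape, dtype=local.dtype)
for rank, shard in enumerate(shards):
    output[rank].copy_(shard)

# Same post-processing as GatherTensor.forward: split the leading world_size
# dimension back into per-rank chunks and concatenate along the last dim.
tensor_list = output.chunk(world_size, dim=0)
gathered = torch.cat(tensor_list, dim=-1).squeeze(0).contiguous()
assert torch.equal(gathered, full)
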
@@ -139,6 +139,7 @@ def backward(ctx, grad_output):
         grad_output = input_list[rank].contiguous()
         return None, grad_output
 
+
 class TensorParallel_Layer(nn.Module, ABC):
     """
     A base class for model layers with tensor parallelism support.
@@ -434,19 +435,18 @@ def __init__(self, module, mp_group=None, skip_partition=False, gather_output=Fa
         self.config_tp_params(self.weight)
         if self.bias is not None:
             self.config_tp_params(self.bias)
-        self.gather_output=gather_output
-     
+        self.gather_output = gather_output
 
     def forward(self, input):
         if getattr(self, 'mp_group', None) is not None:
             input = ColumnParallel.apply(self.mp_group, input)
         output = torch.matmul(input, self.weight.transpose(-1, -2))
         if self.bias is not None:
             output += self.bias
-             
+
         if self.gather_output:
-            output = GatherTensor.apply(self.mp_group,output)
-             
+            output = GatherTensor.apply(self.mp_group, output)
+
         return output
 
     @torch.no_grad()
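
To see what gather_output changes in the forward above, here is a single-process shape sketch (illustrative only; it emulates the ranks of mp_group with a Python list instead of collectives).

import torch

in_features, out_features, world_size = 8, 6, 2
x = torch.randn(4, in_features)
full_weight = torch.randn(out_features, in_features)

# Column parallelism: each rank keeps a slice of the output dimension.
weight_shards = full_weight.chunk(world_size, dim=0)

# Per-rank compute, matching torch.matmul(input, self.weight.transpose(-1, -2)).
partials = [torch.matmul(x, w.transpose(-1, -2)) for w in weight_shards]
print(partials[0].shape)  # torch.Size([4, 3]) -- the local shard when gather_output=False

# gather_output=True concatenates the per-rank shards along the last dim,
# which is what GatherTensor.apply does with an all-gather.
full_output = torch.cat(partials, dim=-1)
assert torch.allclose(full_output, torch.matmul(x, full_weight.transpose(-1, -2)), atol=1e-5)
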
@@ -634,7 +634,7 @@ def __init__(self, module, mp_group, **kwargs):
     def forward(self, input):
         input_shard_size = get_shard_size(input.shape[-1], self.tp_world_size, "lm_head")
         input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.tp_world_size, "lm_head")[0:self.tp_index])
-         
+
         output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size],
                               self.weight.transpose(-1, -2))
         if self.mp_group is not None:
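
The sharded-input matmul above is the usual row-parallel decomposition. The following single-process sketch (illustrative; even shard sizes stand in for DeepSpeed's get_shard_size helpers, and a Python sum stands in for dist.all_reduce over mp_group) shows why summing the per-rank partial products reproduces the unsharded lm_head output.

import torch

hidden, vocab, world_size = 8, 10, 2
x = torch.randn(1, 4, hidden)
weight = torch.randn(vocab, hidden)
shard = hidden // world_size  # even split; the real code uses get_shard_size(...)

partials = []
for rank in range(world_size):
    offset = rank * shard  # input_shard_offset for this rank
    # Each rank multiplies its input slice by its slice of the weight's input dim.
    partials.append(torch.matmul(x[:, :, offset:offset + shard],
                                 weight[:, offset:offset + shard].transpose(-1, -2)))

# dist.all_reduce across ranks sums these partial products;
# summing them here reproduces the full matmul.
reduced = torch.stack(partials).sum(dim=0)
assert torch.allclose(reduced, torch.matmul(x, weight.transpose(-1, -2)), atol=1e-5)
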