Skip to content

Commit 74b631a

Browse files
committed
change layout of dispatch output x_scales
1 parent 5cd59de commit 74b631a

File tree

3 files changed

+44
-9
lines changed

3 files changed

+44
-9
lines changed

csrc/deep_ep.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,12 +1159,26 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
11591159
if (not use_ue8m0) {
11601160
packed_recv_x_scales = torch::empty({num_local_experts, hidden / 128, num_ranks * num_max_dispatch_tokens_per_rank},
11611161
torch::dtype(torch::kFloat32).device(torch::kCUDA));
1162+
packed_recv_x_scales = torch::transpose(packed_recv_x_scales.value(), 1, 2);
11621163
} else {
11631164
EP_HOST_ASSERT(round_scale);
1164-
packed_recv_x_scales = torch::empty({num_local_experts, hidden / 512, num_ranks * num_max_dispatch_tokens_per_rank},
1165-
torch::dtype(torch::kInt).device(torch::kCUDA));
1165+
// The blockscale tensor is in FP8-E4M3, with logical shape (32, 4, rm, 4, rk, l),
1166+
// but the physical layout is (l, rm, rk, 32, 4, 4).
1167+
// For the shape of output_scales, `32 * 4 * rm` is m padded to the nearest multiple of 128.
1168+
// `4 * rk` is `k // 16` padded to the nearest multiple of 4. These layout constants are
1169+
// required by the NVIDIA Blackwell MMA operations.
1170+
// So we need to allocate the tensor with shape (l, rm, rk, 32, 4, 4) and transpose it to the logical layout.
1171+
auto rm = (hidden + 127) / 128;
1172+
auto rk = (hidden + 15) / 16;
1173+
auto l = num_local_experts;
1174+
packed_recv_x_scales = torch::empty({l, rm, rk, 32, 4, 4},
1175+
torch::dtype(torch::kFloat8_e4m3fn).device(torch::kCUDA));
1176+
// Reshape and permute the packed_recv_x_scales tensor to match the required layout
1177+
packed_recv_x_scales = packed_recv_x_scales.value().view(torch::kFloat8_e4m3fn).view(
1178+
{l, rm, rk, 32, 4, 4}
1179+
);
1180+
packed_recv_x_scales = packed_recv_x_scales.value().permute({3, 4, 1, 5, 2, 0});
11661181
}
1167-
packed_recv_x_scales = torch::transpose(packed_recv_x_scales.value(), 1, 2);
11681182
packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr();
11691183
}else if (use_nvfp4) {
11701184
constexpr int SF_VEC_SIZE = 16;

csrc/kernels/internode_ll.cu

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -517,18 +517,27 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf
517517
}
518518
} else if constexpr (kUseNVFP4) {
519519
// Equivalent CuTe layout:
520-
// (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1))
521520
const auto src_scales = reinterpret_cast<uint8_t*>(reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
522521
const auto num_elems_per_pack = static_cast<int>(sizeof(packed_t) / sizeof(scale_t));
522+
const auto max_rk = (hidden_int4 * 4 + 15) / 16;
523523
const auto token_idx = recv_token_begin_idx + i;
524-
const auto token_stride = num_elems_per_pack;
525-
const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack;
524+
525+
const int stride_dim_0 = 1;
526+
const int stride_dim_1 = num_elems_per_pack;
527+
const int stride_dim_2 = 4*4;
528+
const int stride_rk = 32*4*4;
529+
const int stride_rm = max_rk*32*4*4;
530+
531+
const auto index_rm = token_idx / (32 * 4);
526532
#pragma unroll
527533
for (int j = lane_id; j < num_scales; j += 32) {
528-
const auto pack_idx = j / num_elems_per_pack;
529-
const auto elem_idx = j % num_elems_per_pack;
534+
auto index_rk = j / num_elems_per_pack;
535+
auto index_dim_2 = (token_idx % (32)) / 4;
536+
auto index_dim_1 = token_idx % (32 * 4);
537+
auto index_dim_0 = j % num_elems_per_pack;
538+
auto offset = index_dim_0 * stride_dim_0 + index_dim_1 * stride_dim_1 + index_dim_2 * stride_dim_2 + index_rk * stride_rk + index_rm * stride_rm;
530539
auto scale = ld_nc_global(src_scales + j);
531-
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
540+
recv_x_scales[offset] = scale;
532541
}
533542
}
534543
}

tests/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def per_token_cast_to_fp8(x: torch.Tensor):
5252

5353

5454
def cast_fp8_to_fp32(x_fp8: torch.Tensor, x_scales: torch.Tensor):
55+
# TODO(shifangx): remove print after debugging
56+
print(f"in cast_fp8_to_fp32, x_fp8.shape: {x_fp8.shape}, x_scales.shape: {x_scales.shape}")
5557
if x_fp8.numel() == 0:
5658
return x_fp8.to(torch.bfloat16)
5759
if x_scales.dtype == torch.int:
@@ -91,11 +93,21 @@ def int32_to_8floats_lookup(tensor: torch.Tensor, table: torch.Tensor) -> torch.
9193

9294

9395
def cast_nvfp4_to_fp32(x_nvfp4: torch.Tensor, x_scales: torch.Tensor, x_sf_scale: float, use_ue8m0_for_nvfp4_sf: bool = False):
96+
# TODO(shifangx): remove print after debugging
97+
print(f"in cast_nvfp4_to_fp32, x_nvfp4.shape: {x_nvfp4.shape}, x_scales.shape: {x_scales.shape}")
9498
NVFP4_TABLE = torch.tensor([0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1.0, -1.5, -2, -3, -4, -6], dtype=torch.float32, device='cuda')
9599
if use_ue8m0_for_nvfp4_sf:
96100
x_scales = x_scales.view(dtype=torch.int8).to(torch.int) << 23
97101
x_scales = x_scales.view(dtype=torch.float)
98102
else:
103+
# shape of x_scales: (32, 4, rm, 4, rk, l)
104+
dim_0, dim_1, dim_2, dim_3, dim_4, dim_5 = x_scales.shape
105+
assert dim_0 == 32 and dim_1 == 4 and dim_3 == 4 , "x_scales must be in the shape of (32, 4, rm, 4, rk, l)"
106+
rm = dim_2
107+
rk = dim_4
108+
l = dim_5
109+
x_scales = x_scales.view(dtype=torch.float8_e4m3fn).permute(5, 2, 0, 1, 4, 3)  # shape of x_scales: (l, rm, 32, 4, rk, 4)
110+
x_scales = x_scales.reshape(l, rm * 32 * 4, rk * 4)  # shape of x_scales: (l, m, k)
99111
x_scales = x_scales.view(dtype=torch.float8_e4m3fn).to(torch.float32)
100112
x_sf_scale = x_sf_scale.view(*x_sf_scale.shape, 1)
101113
x_scales = x_scales * (1 / x_sf_scale)

0 commit comments

Comments
 (0)