
Commit 63ad6b4

fix issue with nvfp4 data format
1 parent 57a670a commit 63ad6b4

File tree

8 files changed: +495 −87 lines changed


csrc/deep_ep.cpp

Lines changed: 19 additions & 13 deletions
@@ -1091,10 +1091,10 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Te
 Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
                              const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
                              const std::optional<torch::Tensor>& dispatch_wait_recv_cost_stats,
-                             const std::optional<torch::Tensor>& x_global_scales,
+                             const std::optional<torch::Tensor>& x_sf_scale,
                              int num_max_dispatch_tokens_per_rank, int num_experts,
                              bool use_fp8, bool round_scale, bool use_ue8m0,
-                             bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
+                             bool use_nvfp4, bool use_ue8m0_for_sf,
                              bool async, bool return_recv_hook) {
 #ifndef DISABLE_NVSHMEM
     EP_HOST_ASSERT(low_latency_mode);

@@ -1139,9 +1139,8 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
     stream_wait(launch_stream, compute_stream);
 
     // Allocate packed tensors
-    constexpr int NUM_ELEMS_PER_PACK = 8;
-    auto packed_recv_x = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, use_nvfp4 ? hidden / NUM_ELEMS_PER_PACK : hidden},
-                                      x.options().dtype(use_nvfp4 ? torch::kUInt32 : (use_fp8 ? torch::kFloat8_e4m3fn: torch::kBFloat16)));
+    auto packed_recv_x = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, use_nvfp4 ? hidden / 2 : hidden},
+                                      x.options().dtype(use_nvfp4 ? torch::kUInt8 : (use_fp8 ? torch::kFloat8_e4m3fn: torch::kBFloat16)));
     auto packed_recv_src_info = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank}, torch::dtype(torch::kInt32).device(torch::kCUDA));
     auto packed_recv_layout_range = torch::empty({num_local_experts, num_ranks}, torch::dtype(torch::kInt64).device(torch::kCUDA));
     auto packed_recv_count = torch::empty({num_local_experts}, torch::dtype(torch::kInt32).device(torch::kCUDA));

@@ -1172,19 +1171,26 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
         constexpr int mTileSize_dim_1 = 4;
         constexpr int mTileSize = mTileSize_dim_0 * mTileSize_dim_1;
 
+        assert(hidden % kNumPerChannels == 0);
         auto l = num_local_experts;
         auto m = num_ranks * num_max_dispatch_tokens_per_rank;
         auto rm = (m + 127) / 128;
-        auto rk = hidden / (kNumPerChannels * NUM_SF_ELEMS_PER_PACK);
-        auto scale_dtype = use_ue8m0_for_nvfp4_x_scale ?
-                           torch::dtype(torch::kUInt8) :
-                           torch::dtype(torch::kFloat8_e4m3fn);
+        auto rk = (hidden + (kNumPerChannels * NUM_SF_ELEMS_PER_PACK) -1 ) / (kNumPerChannels * NUM_SF_ELEMS_PER_PACK);
         // The physical layout is (l, rm, rk, 32, 4, 4).
-        packed_recv_x_scales = torch::empty({l, rm, rk, 32, 4, 4},
-                                            scale_dtype.device(torch::kCUDA));
+        if (use_ue8m0_for_sf) {
+            packed_recv_x_scales = torch::empty({l, rm, rk, 32, 4, 4},
+                                                torch::dtype(torch::kInt).device(torch::kCUDA));
+        } else {
+            packed_recv_x_scales = torch::empty({l, rm, rk, 32, 4, 4},
+                                                torch::dtype(torch::kFloat8_e4m3fn).device(torch::kCUDA));
+        }
         // After permute, the logical shape is (32, 4, rm, 4, rk, l)
         packed_recv_x_scales = packed_recv_x_scales.value().permute({3, 4, 1, 5, 2, 0});
 
+        // The physical layout is (l, m, k // 2).
+        // After permute, the logical shape is (m, k // 2, l).
+        packed_recv_x = packed_recv_x.permute({1, 2, 0});
+
         packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr();
         EP_HOST_ASSERT(packed_recv_x_scales_ptr != nullptr);
     }

@@ -1197,15 +1203,15 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
              packed_recv_count.data_ptr<int>(),
              cumulative_local_expert_recv_stats.has_value() ? cumulative_local_expert_recv_stats->data_ptr<int>() : nullptr,
              dispatch_wait_recv_cost_stats.has_value() ? dispatch_wait_recv_cost_stats->data_ptr<int64_t>() : nullptr,
-             x_global_scales.has_value() ? x_global_scales->data_ptr<float>() : nullptr,
+             x_sf_scale.has_value() ? x_sf_scale->data_ptr<float>() : nullptr,
              buffer.dispatch_rdma_recv_data_buffer, buffer.dispatch_rdma_recv_count_buffer,
              buffer.dispatch_rdma_send_buffer,
              x.data_ptr(), topk_idx.data_ptr<int64_t>(),
              next_clean_meta.first, next_clean_meta.second,
              num_tokens, hidden, num_max_dispatch_tokens_per_rank,
              num_topk, num_experts, rank, num_ranks,
              use_fp8, round_scale, use_ue8m0,
-             use_nvfp4, use_ue8m0_for_nvfp4_x_scale,
+             use_nvfp4, use_ue8m0_for_sf,
              workspace, num_device_sms,
              launch_stream, phases);
 };
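
Editor's note: for reference, a minimal host-side sketch in PyTorch of the buffers the new NVFP4 path allocates. The constants `kNumPerChannels` and `NUM_SF_ELEMS_PER_PACK` are not defined in this hunk, so the values below (16 and 4) are assumptions for illustration only; the shapes and permutes mirror the code above.

```python
import torch

# Assumed values for constants referenced but not shown in this hunk:
# kNumPerChannels (NVFP4 scaling-block size) and NUM_SF_ELEMS_PER_PACK
# (number of scale bytes packed per 32-bit word).
K_NUM_PER_CHANNELS = 16
NUM_SF_ELEMS_PER_PACK = 4

def nvfp4_recv_buffers(num_local_experts: int, num_ranks: int,
                       num_max_dispatch_tokens_per_rank: int, hidden: int,
                       use_ue8m0_for_sf: bool):
    l = num_local_experts
    m = num_ranks * num_max_dispatch_tokens_per_rank

    # Packed data: two FP4 values per uint8 byte, hence `hidden // 2` in the last dim,
    # then permuted from the physical (l, m, k // 2) to the logical (m, k // 2, l).
    packed_recv_x = torch.empty((l, m, hidden // 2), dtype=torch.uint8)
    packed_recv_x = packed_recv_x.permute(1, 2, 0)

    # Scale factors: physical layout (l, rm, rk, 32, 4, 4), with the same rounding-up
    # of rk as the C++ code, then permuted to the logical (32, 4, rm, 4, rk, l).
    rm = (m + 127) // 128
    rk = (hidden + K_NUM_PER_CHANNELS * NUM_SF_ELEMS_PER_PACK - 1) \
         // (K_NUM_PER_CHANNELS * NUM_SF_ELEMS_PER_PACK)
    scale_dtype = torch.int32 if use_ue8m0_for_sf else torch.float8_e4m3fn
    packed_recv_x_scales = torch.empty((l, rm, rk, 32, 4, 4), dtype=scale_dtype)
    packed_recv_x_scales = packed_recv_x_scales.permute(3, 4, 1, 5, 2, 0)
    return packed_recv_x, packed_recv_x_scales

x, sf = nvfp4_recv_buffers(num_local_experts=2, num_ranks=8,
                           num_max_dispatch_tokens_per_rank=128,
                           hidden=7168, use_ue8m0_for_sf=True)
print(x.shape)   # torch.Size([1024, 3584, 2])
print(sf.shape)  # torch.Size([32, 4, 8, 4, 112, 2])
```

The UE8M0 branch now allocates `torch::kInt` elements of the same shape, presumably so that several UE8M0 scale bytes fit in each 32-bit element, while the non-UE8M0 branch keeps one FP8 (E4M3) scale per element.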

csrc/deep_ep.hpp

Lines changed: 2 additions & 2 deletions
@@ -147,10 +147,10 @@ struct Buffer {
     low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
                          const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
                          const std::optional<torch::Tensor>& dispatch_wait_recv_cost_stats,
-                         const std::optional<torch::Tensor>& x_global_scales,
+                         const std::optional<torch::Tensor>& x_sf_scale,
                          int num_max_dispatch_tokens_per_rank, int num_experts,
                          bool use_fp8, bool round_scale, bool use_ue8m0,
-                         bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
+                         bool use_nvfp4, bool use_ue8m0_for_sf,
                          bool async, bool return_recv_hook);
 
     std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>

csrc/kernels/api.cuh

Lines changed: 2 additions & 2 deletions
@@ -144,14 +144,14 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
               int* packed_recv_count,
               int* cumulative_local_expert_recv_stats,
               int64_t* dispatch_wait_recv_cost_stats,
-              const float* x_global_scales,
+              const float* x_sf_scale,
               void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
               const void* x, const int64_t* topk_idx,
               int* next_clean, int num_next_clean_int,
               int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
               int num_topk, int num_experts, int rank, int num_ranks,
               bool use_fp8, bool round_scale, bool use_ue8m0,
-              bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
+              bool use_nvfp4, bool use_ue8m0_for_sf,
               void* workspace, int num_device_sms,
               cudaStream_t stream, int phases);
 

csrc/kernels/internode_ll.cu

Lines changed: 24 additions & 17 deletions
@@ -200,7 +200,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
          int* packed_recv_count,
          int* cumulative_local_expert_recv_stats,
          int64_t* dispatch_wait_recv_cost_stats,
-         const float* x_global_scales,
+         const float* x_sf_scale,
          void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
          const void* x, const int64_t* topk_idx,
          int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert,

@@ -275,8 +275,8 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
     float SFScaleVal = 1.0f;
     if constexpr (kUseNVFP4) {
         // Get scaling value;
-        EP_DEVICE_ASSERT(x_global_scales != nullptr);
-        SFScaleVal = *(static_cast<const float*>(x_global_scales));
+        EP_DEVICE_ASSERT(x_sf_scale != nullptr);
+        SFScaleVal = *(static_cast<const float*>(x_sf_scale));
     }
 
     // FP8 or NVFP4 cast

@@ -517,21 +517,28 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
                 recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
             }
         } else if constexpr (kUseNVFP4) {
-            // The physical layout is (l, rm, rk, 32, 4, 4).
+            // The physical layout is (l, rm, rk, 32, 4, 4)
            const auto src_scales = reinterpret_cast<uint8_t*>(reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
            const auto num_elems_per_pack = static_cast<int>(sizeof(packed_t) / sizeof(scale_t));
            const auto token_idx = recv_token_begin_idx + i;
-            const auto token_stride = num_scales * sizeof(scale_t);
-            const auto pack_stride = num_elems_per_pack;
-            const auto rm = token_idx / 128;
-            const auto rm_res = token_idx % 128;
+
+            const auto padded_k = (kHidden + (kNumPerChannels * num_elems_per_pack) -1 ) / (kNumPerChannels * num_elems_per_pack);
+            const auto dim0_stride = 128 * padded_k / kNumPerChannels;
+            const auto dim1_stride = 128 * num_elems_per_pack;
+            const auto dim2_stride = 4 * num_elems_per_pack;
+            const auto dim3_stride = num_elems_per_pack;
+
+            const auto dim0_offset = token_idx / 128;
+            const auto dim2_offset = (token_idx % 128) % 32;
+            const auto dim3_offset = (token_idx % 128) / 32;
+
            #pragma unroll
            for (int j = lane_id; j < num_scales; j += 32) {
-                const auto pack_idx = j / num_elems_per_pack;
-                const auto elem_idx = j % num_elems_per_pack;
+                const auto dim1_offset = j / num_elems_per_pack;
+                const auto dim4_offset = j % num_elems_per_pack;
                auto scale = ld_nc_global(src_scales + j);
-                // recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
-                recv_x_scales[rm * token_stride * 128 + pack_idx * pack_stride * 128 + rm_res * pack_stride + elem_idx] = scale;
+                const auto offset = dim0_offset * dim0_stride + dim1_offset * dim1_stride + dim2_offset * dim2_stride + dim3_offset * dim3_stride + dim4_offset;
+                recv_x_scales[offset] = scale;
            }
        }
    }

@@ -543,14 +550,14 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
               int* packed_recv_count,
               int* cumulative_local_expert_recv_stats,
               int64_t* dispatch_wait_recv_cost_stats,
-              const float* x_global_scales,
+              const float* x_sf_scale,
               void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
               const void* x, const int64_t* topk_idx,
               int* next_clean, int num_next_clean_int,
               int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
               int num_topk, int num_experts, int rank, int num_ranks,
               bool use_fp8, bool round_scale, bool use_ue8m0,
-              bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
+              bool use_nvfp4, bool use_ue8m0_for_sf,
               void* workspace, int num_device_sms,
               cudaStream_t stream, int phases) {
     constexpr int kNumMaxTopK = 9;

@@ -578,17 +585,17 @@ if (use_fp8 and not use_ue8m0) \
     dispatch_func = dispatch<true, false, false, false, hidden>; \
 if (use_fp8 and use_ue8m0) \
     dispatch_func = dispatch<true, true, false, false, hidden>; \
-if (use_nvfp4 and not use_ue8m0_for_nvfp4_x_scale) \
+if (use_nvfp4 and not use_ue8m0_for_sf) \
     dispatch_func = dispatch<false, false, true, false, hidden>; \
-if (use_nvfp4 and use_ue8m0_for_nvfp4_x_scale) \
+if (use_nvfp4 and use_ue8m0_for_sf) \
     dispatch_func = dispatch<false, false, true, true, hidden>; \
 LAUNCH_KERNEL(&cfg, dispatch_func, \
               packed_recv_x, packed_recv_x_scales, \
               packed_recv_src_info, packed_recv_layout_range, \
               packed_recv_count, \
               cumulative_local_expert_recv_stats, \
               dispatch_wait_recv_cost_stats, \
-              x_global_scales, \
+              x_sf_scale, \
               rdma_recv_x, rdma_recv_count, rdma_x, \
               x, topk_idx, \
               atomic_counter_per_expert, atomic_finish_counter_per_expert, \
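
Editor's note: the new receive-side indexing decomposes each (token, scale) pair into coordinates of the (rm, rk, 32, 4, 4) tile layout instead of the old row-major addressing. The sketch below reproduces that coordinate mapping in plain Python; `NUM_ELEMS_PER_PACK` and the tensor shape are illustrative assumptions (the kernel derives them from `sizeof(packed_t) / sizeof(scale_t)` and the host-side allocation).

```python
import numpy as np

# Illustrative assumption: the kernel computes this as sizeof(packed_t) / sizeof(scale_t).
NUM_ELEMS_PER_PACK = 4

def scale_coords(token_idx: int, j: int):
    """Map (received token index, scale index j) to (rm, rk, 32, 4, 4) coordinates."""
    dim0 = token_idx // 128             # which 128-token tile
    dim1 = j // NUM_ELEMS_PER_PACK      # which pack along the scale dimension
    dim2 = (token_idx % 128) % 32       # row inside the 128-token tile
    dim3 = (token_idx % 128) // 32      # which 32-row quarter of the tile
    dim4 = j % NUM_ELEMS_PER_PACK       # element inside the pack
    return dim0, dim1, dim2, dim3, dim4

# Scatter per-token scale bytes for one local expert into the swizzled layout.
num_tokens, num_scales = 256, 448       # e.g. hidden = 7168 with 16 elements per scale
rm = (num_tokens + 127) // 128
rk = (num_scales + NUM_ELEMS_PER_PACK - 1) // NUM_ELEMS_PER_PACK
recv_x_scales = np.zeros((rm, rk, 32, 4, 4), dtype=np.uint8)
for token_idx in range(num_tokens):
    for j in range(num_scales):
        recv_x_scales[scale_coords(token_idx, j)] = (token_idx + j) % 256  # dummy scale byte
```

The kernel flattens these coordinates into a single offset via the `dim*_stride` values; the sketch uses multi-dimensional indexing directly for readability.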

deep_ep/buffer.py

Lines changed: 7 additions & 7 deletions
@@ -528,9 +528,9 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
                              num_max_dispatch_tokens_per_rank: int, num_experts: int,
                              cumulative_local_expert_recv_stats: Optional[torch.Tensor] = None,
                              dispatch_wait_recv_cost_stats: Optional[torch.Tensor] = None,
-                             x_global_scales: Optional[torch.Tensor] = None,
+                             x_sf_scale: Optional[torch.Tensor] = None,
                              use_fp8: bool = True, round_scale: bool = False, use_ue8m0: bool = False,
-                             use_nvfp4: bool = False, use_ue8m0_for_nvfp4_x_scale: bool = False,
+                             use_nvfp4: bool = False, use_ue8m0_for_sf: bool = False,
                              async_finish: bool = False, return_recv_hook: bool = False) -> \
             Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, Tuple, EventOverlap, Callable]:
         """

@@ -553,12 +553,12 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
             dispatch_wait_recv_cost_stats: a cumulative time spent waiting to receive each token tensor for statistics,
                 which should have shape `[num_ranks, num_ranks]` and be typed as `torch.int64`.
                 This is useful for detecting and pre-cisely localizing slow anomalies.
-            x_global_scales: a float32 tensor with dim() == 0, the scaling factors for the entire dispatch.
+            x_sf_scale: a float32 tensor with dim() == 0, the scaling factors for the entire dispatch.
             use_fp8: whether to enable FP8 casting, with this, the received data will be a tuple of FP8 tensor and scaling factors.
             round_scale: whether round the scaling factors into power of 2.
             use_ue8m0: whether use UE8M0 as scaling factor format (available only with `round_scale=True`).
             use_nvfp4: whether to enable NVFP4 casting, with this, the received data will be a tuple of NVFP4 tensor and scaling factors.
-            use_ue8m0_for_nvfp4_x_scale: whether use UE8M0 as NVFP4 scaling factor format (available only with `use_nvfp4=True`).
+            use_ue8m0_for_sf: whether use UE8M0 as NVFP4 scaling factor format (available only with `use_nvfp4=True`).
             async_finish: the current stream will not wait for the communication kernels to be finished if set.
             return_recv_hook: return a receiving hook if set. If set, the kernel will just do the RDMA request issues,
                 but **without actually receiving the data**. You must call the received hook to make sure the data's arrival.

@@ -591,17 +591,17 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
             self.runtime.low_latency_dispatch(x, topk_idx,
                                               cumulative_local_expert_recv_stats,
                                               dispatch_wait_recv_cost_stats,
-                                              x_global_scales,
+                                              x_sf_scale,
                                               num_max_dispatch_tokens_per_rank, num_experts,
                                               use_fp8, round_scale, use_ue8m0,
-                                              use_nvfp4, use_ue8m0_for_nvfp4_x_scale,
+                                              use_nvfp4, use_ue8m0_for_sf,
                                               async_finish, return_recv_hook)
         handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
         tensors_to_record = (x, topk_idx,
                              packed_recv_x, packed_recv_x_scales, packed_recv_count,
                              packed_recv_src_info, packed_recv_layout_range,
                              cumulative_local_expert_recv_stats,
-                             x_global_scales)
+                             x_sf_scale)
         if use_fp8 or use_nvfp4:
             packed_recv_x = (packed_recv_x, packed_recv_x_scales)
         return packed_recv_x, packed_recv_count, handle, \
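
Editor's note: a call-site sketch with the renamed keyword arguments, assuming `buffer` is an already-initialized low-latency `deep_ep.Buffer` and `x` / `topk_idx` are the usual BF16 activations and top-k expert indices. The concrete sizes, the scale value, and the choice of `use_fp8=False` alongside NVFP4 are placeholders, not verified defaults.

```python
import torch

# Hypothetical sizes; only the keyword names come from the diff above.
num_max_dispatch_tokens_per_rank, num_experts = 128, 288

# A 0-dim float32 CUDA tensor, as described for `x_sf_scale` in the docstring.
x_sf_scale = torch.tensor(1.0, dtype=torch.float32, device='cuda')

recv_x, recv_count, handle, event, hook = buffer.low_latency_dispatch(
    x, topk_idx,
    num_max_dispatch_tokens_per_rank, num_experts,
    x_sf_scale=x_sf_scale,
    use_fp8=False,                      # assumption: FP8 casting disabled when NVFP4 is used
    use_nvfp4=True, use_ue8m0_for_sf=True,
    return_recv_hook=True)

# With use_nvfp4=True the received data is a tuple of the packed FP4 tensor
# (two values per uint8) and the swizzled scaling factors.
packed_fp4, packed_scales = recv_x
hook()                                  # actually receive the data when return_recv_hook=True
```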
