Commit 82147f2

Rename some parameters and fix the scale dtype:
rename x_sf_scale to x_global_scales; rename use_ue8m0_for_sf to use_ue8m0_for_nvfp4_x_scale; set the x_scale dtype to torch::kFloat8_e4m3fn when use_ue8m0_for_nvfp4_x_scale == False and to torch::kUInt8 when use_ue8m0_for_nvfp4_x_scale == True.
1 parent ccf4eaf commit 82147f2
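
As a quick reference, the dtype rule from the commit message can be sketched in PyTorch as below; the tensor shape is a placeholder and only the dtype selection mirrors the change:

import torch

def pick_nvfp4_x_scale_dtype(use_ue8m0_for_nvfp4_x_scale: bool) -> torch.dtype:
    # UE8M0 scales are stored as raw 8-bit exponent bytes (uint8); otherwise the
    # per-block scales are stored as FP8 E4M3 values.
    return torch.uint8 if use_ue8m0_for_nvfp4_x_scale else torch.float8_e4m3fn

# Placeholder shape; the real buffer uses the (l, rm, rk, 32, 4, 4) layout on CUDA.
scales = torch.empty(16, dtype=pick_nvfp4_x_scale_dtype(False))
print(scales.dtype)  # torch.float8_e4m3fn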

File tree

7 files changed, +54 -45 lines changed


csrc/deep_ep.cpp

Lines changed: 8 additions & 5 deletions
@@ -1091,10 +1091,10 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Te
 Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
                              const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
                              const std::optional<torch::Tensor>& dispatch_wait_recv_cost_stats,
-                             const std::optional<torch::Tensor>& x_sf_scale,
+                             const std::optional<torch::Tensor>& x_global_scales,
                              int num_max_dispatch_tokens_per_rank, int num_experts,
                              bool use_fp8, bool round_scale, bool use_ue8m0,
-                             bool use_nvfp4, bool use_ue8m0_for_sf,
+                             bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
                              bool async, bool return_recv_hook) {
 #ifndef DISABLE_NVSHMEM
     EP_HOST_ASSERT(low_latency_mode);
@@ -1176,9 +1176,12 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
     auto m = num_ranks * num_max_dispatch_tokens_per_rank;
     auto rm = (m + 127) / 128;
     auto rk = hidden / (kNumPerChannels * NUM_SF_ELEMS_PER_PACK);
+    auto scale_dtype = use_ue8m0_for_nvfp4_x_scale ?
+                       torch::dtype(torch::kUInt8) :
+                       torch::dtype(torch::kFloat8_e4m3fn);
     // The physical layout is (l, rm, rk, 32, 4, 4).
     packed_recv_x_scales = torch::empty({l, rm, rk, 32, 4, 4},
-                                        torch::dtype(torch::kUInt8).device(torch::kCUDA));
+                                        scale_dtype.device(torch::kCUDA));
     // After permute, the logical shape is (32, 4, rm, 4, rk, l)
     packed_recv_x_scales = packed_recv_x_scales.value().permute({3, 4, 1, 5, 2, 0});
@@ -1194,15 +1197,15 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
         packed_recv_count.data_ptr<int>(),
         cumulative_local_expert_recv_stats.has_value() ? cumulative_local_expert_recv_stats->data_ptr<int>() : nullptr,
         dispatch_wait_recv_cost_stats.has_value() ? dispatch_wait_recv_cost_stats->data_ptr<int64_t>() : nullptr,
-        x_sf_scale.has_value() ? x_sf_scale->data_ptr<float>() : nullptr,
+        x_global_scales.has_value() ? x_global_scales->data_ptr<float>() : nullptr,
         buffer.dispatch_rdma_recv_data_buffer, buffer.dispatch_rdma_recv_count_buffer,
         buffer.dispatch_rdma_send_buffer,
         x.data_ptr(), topk_idx.data_ptr<int64_t>(),
         next_clean_meta.first, next_clean_meta.second,
         num_tokens, hidden, num_max_dispatch_tokens_per_rank,
         num_topk, num_experts, rank, num_ranks,
         use_fp8, round_scale, use_ue8m0,
-        use_nvfp4, use_ue8m0_for_sf,
+        use_nvfp4, use_ue8m0_for_nvfp4_x_scale,
         workspace, num_device_sms,
         launch_stream, phases);
     };
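
To make the scale-buffer layout above easier to follow, here is a small PyTorch sketch with made-up l/rm/rk sizes: it allocates the physical (l, rm, rk, 32, 4, 4) buffer with the dtype chosen by use_ue8m0_for_nvfp4_x_scale and applies the same permute to obtain the logical (32, 4, rm, 4, rk, l) view. It is illustrative only and runs on CPU.

import torch

l, rm, rk = 2, 3, 4  # made-up sizes for illustration
use_ue8m0_for_nvfp4_x_scale = False
scale_dtype = torch.uint8 if use_ue8m0_for_nvfp4_x_scale else torch.float8_e4m3fn

# Physical layout (l, rm, rk, 32, 4, 4), as allocated in deep_ep.cpp.
packed_recv_x_scales = torch.empty((l, rm, rk, 32, 4, 4), dtype=scale_dtype)

# Same permute as above: the logical shape becomes (32, 4, rm, 4, rk, l).
logical = packed_recv_x_scales.permute(3, 4, 1, 5, 2, 0)
print(logical.shape)  # torch.Size([32, 4, 3, 4, 4, 2])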

csrc/deep_ep.hpp

Lines changed: 2 additions & 2 deletions
@@ -147,10 +147,10 @@ struct Buffer {
     low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
                          const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
                          const std::optional<torch::Tensor>& dispatch_wait_recv_cost_stats,
-                         const std::optional<torch::Tensor>& x_sf_scale,
+                         const std::optional<torch::Tensor>& x_global_scales,
                          int num_max_dispatch_tokens_per_rank, int num_experts,
                          bool use_fp8, bool round_scale, bool use_ue8m0,
-                         bool use_nvfp4, bool use_ue8m0_for_sf,
+                         bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
                          bool async, bool return_recv_hook);

     std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>

csrc/kernels/api.cuh

Lines changed: 2 additions & 2 deletions
@@ -144,14 +144,14 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
               int* packed_recv_count,
               int* cumulative_local_expert_recv_stats,
               int64_t* dispatch_wait_recv_cost_stats,
-              const float* x_sf_scale,
+              const float* x_global_scales,
               void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
               const void* x, const int64_t* topk_idx,
               int* next_clean, int num_next_clean_int,
               int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
               int num_topk, int num_experts, int rank, int num_ranks,
               bool use_fp8, bool round_scale, bool use_ue8m0,
-              bool use_nvfp4, bool use_ue8m0_for_sf,
+              bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
               void* workspace, int num_device_sms,
               cudaStream_t stream, int phases);

csrc/kernels/internode_ll.cu

Lines changed: 21 additions & 16 deletions
@@ -81,10 +81,11 @@ __device__ inline uint8_t float_to_e2m1(float f) {
     return (sign << 3) | (exp << 1) | mant;
 }

-
 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+// The PTX instructions used here require sm100a.
+#if CUDA_VERSION >= 12080
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
     uint32_t val;
     asm volatile(
         "{\n"
@@ -99,13 +100,16 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
         "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
         "}"
         : "=r"(val)
-        : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), "f"(array[2].x),
-          "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
+        : "f"(array[0].x),
+          "f"(array[0].y),
+          "f"(array[1].x),
+          "f"(array[1].y),
+          "f"(array[2].x),
+          "f"(array[2].y),
+          "f"(array[3].x),
+          "f"(array[3].y));
     return val;
 #else
-#if !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
-#pragma message("warning: this architecture does not support cvt.rn.satfinite.e2m1x2.f32, use float_to_e2m1 instead.")
-#endif
     uint32_t val = 0;
     float2* data = reinterpret_cast<float2*>(&array[0]);
     for (int i = 0; i < 4; ++i) {
@@ -114,7 +118,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
     }
     return val;
 #endif
-}
+#endif
+}

 constexpr int CVT_ELTS_PER_THREAD = 8;
 // Quantizes the provided PackedVec into the uint32_t output
@@ -195,7 +200,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
          int* packed_recv_count,
          int* cumulative_local_expert_recv_stats,
          int64_t* dispatch_wait_recv_cost_stats,
-         const float* x_sf_scale,
+         const float* x_global_scales,
          void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
          const void* x, const int64_t* topk_idx,
          int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert,
@@ -270,8 +275,8 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
         float SFScaleVal = 1.0f;
         if constexpr (kUseNVFP4) {
             // Get scaling value;
-            EP_DEVICE_ASSERT(x_sf_scale != nullptr);
-            SFScaleVal = *(static_cast<const float*>(x_sf_scale));
+            EP_DEVICE_ASSERT(x_global_scales != nullptr);
+            SFScaleVal = *(static_cast<const float*>(x_global_scales));
         }

         // FP8 or NVFP4 cast
@@ -537,14 +542,14 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
               int* packed_recv_count,
               int* cumulative_local_expert_recv_stats,
               int64_t* dispatch_wait_recv_cost_stats,
-              const float* x_sf_scale,
+              const float* x_global_scales,
               void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
               const void* x, const int64_t* topk_idx,
               int* next_clean, int num_next_clean_int,
               int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
               int num_topk, int num_experts, int rank, int num_ranks,
               bool use_fp8, bool round_scale, bool use_ue8m0,
-              bool use_nvfp4, bool use_ue8m0_for_sf,
+              bool use_nvfp4, bool use_ue8m0_for_nvfp4_x_scale,
               void* workspace, int num_device_sms,
               cudaStream_t stream, int phases) {
     constexpr int kNumMaxTopK = 9;
@@ -572,17 +577,17 @@ if (use_fp8 and not use_ue8m0) \
         dispatch_func = dispatch<true, false, false, false, hidden>; \
     if (use_fp8 and use_ue8m0) \
         dispatch_func = dispatch<true, true, false, false, hidden>; \
-    if (use_nvfp4 and not use_ue8m0_for_sf) \
+    if (use_nvfp4 and not use_ue8m0_for_nvfp4_x_scale) \
         dispatch_func = dispatch<false, false, true, false, hidden>; \
-    if (use_nvfp4 and use_ue8m0_for_sf) \
+    if (use_nvfp4 and use_ue8m0_for_nvfp4_x_scale) \
         dispatch_func = dispatch<false, false, true, true, hidden>; \
     LAUNCH_KERNEL(&cfg, dispatch_func, \
                   packed_recv_x, packed_recv_x_scales, \
                   packed_recv_src_info, packed_recv_layout_range, \
                   packed_recv_count, \
                   cumulative_local_expert_recv_stats, \
                   dispatch_wait_recv_cost_stats, \
-                  x_sf_scale, \
+                  x_global_scales, \
                   rdma_recv_x, rdma_recv_count, rdma_x, \
                   x, topk_idx, \
                   atomic_counter_per_expert, atomic_finish_counter_per_expert, \
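
For readers without an sm100a build, a rough Python reference of the fallback path: quantize each float to the nearest representable E2M1 magnitude (0, 0.5, 1, 1.5, 2, 3, 4, 6 with a sign bit) and pack eight 4-bit codes into one 32-bit word. Rounding, tie-breaking, and the nibble packing order here are assumptions and may differ from the CUDA float_to_e2m1 helper; this is only a sketch.

E2M1_VALUES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # magnitudes for codes 0..7

def float_to_e2m1_ref(f: float) -> int:
    # 4-bit code laid out as (sign << 3) | (exp << 1) | mant.
    sign = 1 if f < 0 else 0
    mag = min(abs(f), 6.0)  # saturate at the largest E2M1 value
    code = min(range(8), key=lambda i: abs(E2M1_VALUES[i] - mag))
    return (sign << 3) | code

def pack8_e2m1(vals) -> int:
    # 8 values -> one 32-bit word, 4 bits per value (assumed low nibble first).
    assert len(vals) == 8
    word = 0
    for i, v in enumerate(vals):
        word |= float_to_e2m1_ref(v) << (4 * i)
    return word

print(hex(pack8_e2m1([0.0, 0.4, 1.2, -1.6, 2.4, -3.3, 5.0, 7.0])))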

deep_ep/buffer.py

Lines changed: 7 additions & 7 deletions
@@ -528,9 +528,9 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
                              num_max_dispatch_tokens_per_rank: int, num_experts: int,
                              cumulative_local_expert_recv_stats: Optional[torch.Tensor] = None,
                              dispatch_wait_recv_cost_stats: Optional[torch.Tensor] = None,
-                             x_sf_scale: Optional[torch.Tensor] = None,
+                             x_global_scales: Optional[torch.Tensor] = None,
                              use_fp8: bool = True, round_scale: bool = False, use_ue8m0: bool = False,
-                             use_nvfp4: bool = False, use_ue8m0_for_sf: bool = False,
+                             use_nvfp4: bool = False, use_ue8m0_for_nvfp4_x_scale: bool = False,
                              async_finish: bool = False, return_recv_hook: bool = False) -> \
            Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, Tuple, EventOverlap, Callable]:
        """
@@ -553,12 +553,12 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
            dispatch_wait_recv_cost_stats: a cumulative time spent waiting to receive each token tensor for statistics,
                which should have shape `[num_ranks, num_ranks]` and be typed as `torch.int64`.
                This is useful for detecting and precisely localizing slow anomalies.
-           x_sf_scale: a float32 tensor with dim() == 0, the scaling factors for the entire dispatch.
+           x_global_scales: a float32 tensor with dim() == 0, the scaling factors for the entire dispatch.
            use_fp8: whether to enable FP8 casting, with this, the received data will be a tuple of FP8 tensor and scaling factors.
            round_scale: whether round the scaling factors into power of 2.
            use_ue8m0: whether use UE8M0 as scaling factor format (available only with `round_scale=True`).
            use_nvfp4: whether to enable NVFP4 casting, with this, the received data will be a tuple of NVFP4 tensor and scaling factors.
-           use_ue8m0_for_sf: whether use UE8M0 as NVFP4 scaling factor format (available only with `use_nvfp4=True`).
+           use_ue8m0_for_nvfp4_x_scale: whether use UE8M0 as NVFP4 scaling factor format (available only with `use_nvfp4=True`).
            async_finish: the current stream will not wait for the communication kernels to be finished if set.
            return_recv_hook: return a receiving hook if set. If set, the kernel will just do the RDMA request issues,
                but **without actually receiving the data**. You must call the received hook to make sure the data's arrival.
@@ -591,17 +591,17 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
            self.runtime.low_latency_dispatch(x, topk_idx,
                                              cumulative_local_expert_recv_stats,
                                              dispatch_wait_recv_cost_stats,
-                                             x_sf_scale,
+                                             x_global_scales,
                                              num_max_dispatch_tokens_per_rank, num_experts,
                                              use_fp8, round_scale, use_ue8m0,
-                                             use_nvfp4, use_ue8m0_for_sf,
+                                             use_nvfp4, use_ue8m0_for_nvfp4_x_scale,
                                              async_finish, return_recv_hook)
        handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
        tensors_to_record = (x, topk_idx,
                             packed_recv_x, packed_recv_x_scales, packed_recv_count,
                             packed_recv_src_info, packed_recv_layout_range,
                             cumulative_local_expert_recv_stats,
-                            x_sf_scale)
+                            x_global_scales)
        if use_fp8 or use_nvfp4:
            packed_recv_x = (packed_recv_x, packed_recv_x_scales)
        return packed_recv_x, packed_recv_count, handle, \
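
For context, a hedged call-site sketch using the renamed keyword arguments; `buffer`, `current_x`, `topk_idx`, the integer sizes, and the MAX_E4M3/MAX_NVFP4 constants are assumed to be set up as in tests/test_low_latency.py and are not defined here:

# Assumed setup: buffer, current_x, topk_idx, MAX_E4M3, MAX_NVFP4,
# num_max_dispatch_tokens_per_rank, and num_experts already exist.
x_global_scales = (MAX_E4M3 * MAX_NVFP4) / torch.max(torch.abs(current_x)).to(torch.float32)

packed_recv_x, packed_recv_count, handle, event, hook = \
    buffer.low_latency_dispatch(current_x, topk_idx,
                                num_max_dispatch_tokens_per_rank, num_experts,
                                x_global_scales=x_global_scales,
                                use_fp8=False, use_nvfp4=True,
                                use_ue8m0_for_nvfp4_x_scale=False,
                                async_finish=False, return_recv_hook=False)
recv_x, recv_x_scales = packed_recv_x  # with use_ue8m0_for_nvfp4_x_scale=False, scales are float8_e4m3fn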

tests/test_low_latency.py

Lines changed: 8 additions & 7 deletions
@@ -54,21 +54,21 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
     for dispatch_data_type in ('bf16', 'fp8', 'nvfp4'):
         dispatch_use_fp8 = dispatch_data_type == 'fp8'
         dispatch_use_nvfp4 = dispatch_data_type == 'nvfp4'
-        use_ue8m0_for_sf = False
+        use_ue8m0_for_nvfp4_x_scale = False
         for round_scale in (False, True) if dispatch_use_fp8 else (False, ):
             for use_ue8m0 in (False, True) if round_scale else (False, ):
                 num_times += 1
                 for i in range((num_times % 2) + 1):
                     cumulative_local_expert_recv_stats = torch.zeros((num_local_experts, ), dtype=torch.int, device='cuda')
                     x_max = torch.max(torch.abs(current_x))
-                    x_sf_scale = (MAX_E4M3 * MAX_NVFP4) / x_max.to(torch.float32)
-                    dist.all_reduce(x_sf_scale, op=dist.ReduceOp.MIN, group=group)
+                    x_global_scales = (MAX_E4M3 * MAX_NVFP4) / x_max.to(torch.float32)
+                    dist.all_reduce(x_global_scales, op=dist.ReduceOp.MIN, group=group)
                     packed_recv_x, packed_recv_count, handle, event, hook = \
                         buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                                     use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
-                                                    use_nvfp4=dispatch_use_nvfp4, use_ue8m0_for_sf=use_ue8m0_for_sf,
+                                                    use_nvfp4=dispatch_use_nvfp4, use_ue8m0_for_nvfp4_x_scale=use_ue8m0_for_nvfp4_x_scale,
                                                     cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
-                                                    x_sf_scale=x_sf_scale,
+                                                    x_global_scales=x_global_scales,
                                                     async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
                     hook() if return_recv_hook else event.current_stream_wait()
                     if dispatch_use_fp8:
@@ -77,9 +77,10 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
                     elif dispatch_use_nvfp4:
                         recv_x_scale_view = packed_recv_x[1]
                         recv_x_scale_view = recv_x_scale_view.permute(5, 2, 0, 1, 4, 3)
+                        print(f'for num_times: {num_times}, recv_x_scale_view.shape: {recv_x_scale_view.shape}')
                         recv_x_scale_view = recv_x_scale_view.contiguous().view(num_local_experts, int(num_ranks * num_tokens), hidden // 16)
                         packed_recv_x = (packed_recv_x[0], recv_x_scale_view)
-                        simulated_gemm_x = per_token_cast_back(packed_recv_x[0], packed_recv_x[1], x_sf_scale, use_ue8m0_for_sf=use_ue8m0_for_sf, src_data_format='nvfp4')
+                        simulated_gemm_x = per_token_cast_back(packed_recv_x[0], packed_recv_x[1], x_global_scales, use_ue8m0_for_nvfp4_x_scale=use_ue8m0_for_nvfp4_x_scale, src_data_format='nvfp4')
                     else:
                         packed_recv_x = packed_recv_x
                         simulated_gemm_x = packed_recv_x.clone()
@@ -157,7 +158,7 @@ def test_func(return_recv_hook: bool):
         recv_x, recv_count, handle, event, hook = \
             buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                         cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
-                                        use_fp8=False, use_nvfp4=True, x_sf_scale=x_sf_scale,
+                                        use_fp8=False, use_nvfp4=True, x_global_scales=x_global_scales,
                                         async_finish=False, return_recv_hook=return_recv_hook)
         large_gemm_with_hook(hook) if return_recv_hook else None
         combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
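
MAX_E4M3 and MAX_NVFP4 in the test are assumed here to be 448 and 6 (the E4M3 and E2M1 maxima), so the global scale maps the tensor's absolute maximum onto the combined E4M3 * E2M1 range; the MIN all-reduce then makes every rank use the same, most conservative scale. A tiny worked sketch with made-up numbers:

import torch

MAX_E4M3, MAX_NVFP4 = 448.0, 6.0      # assumed values of the test's constants
x_max = torch.tensor(3.0)             # made-up local absolute maximum
x_global_scales = (MAX_E4M3 * MAX_NVFP4) / x_max
print(float(x_global_scales))         # 2688 / 3.0 = 896.0
# In the test, dist.all_reduce(x_global_scales, op=dist.ReduceOp.MIN, group=group)
# follows, so all ranks agree on the smallest (safest) scale.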
