
Commit ccf4eaf

change test case
1 parent: 9d9e395

File tree: 4 files changed (+14, -7 lines)

csrc/deep_ep.cpp

Lines changed: 2 additions & 0 deletions
@@ -1151,6 +1151,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
     void* packed_recv_x_scales_ptr = nullptr;
     EP_HOST_ASSERT((num_ranks * num_max_dispatch_tokens_per_rank) % 4 == 0 and "TMA requires the number of tokens to be multiple of 4");
 
+    EP_HOST_ASSERT(not (use_fp8 and use_nvfp4));
     if (use_fp8) {
         // TODO: support unaligned cases
         EP_HOST_ASSERT(hidden % 512 == 0);
@@ -1182,6 +1183,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
         packed_recv_x_scales = packed_recv_x_scales.value().permute({3, 4, 1, 5, 2, 0});
 
         packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr();
+        EP_HOST_ASSERT(packed_recv_x_scales_ptr != nullptr);
     }
 
     // Kernel launch
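The two new host-side assertions encode a caller contract: a dispatch may request at most one low-precision format, and the NVFP4 path must have a valid scales output pointer. A minimal Python-level sketch of that contract (the helper name below is hypothetical and not part of DeepEP; the keyword names follow deep_ep/buffer.py):

# Hypothetical helper mirroring the new EP_HOST_ASSERT checks; not part of DeepEP.
def validate_dispatch_flags(use_fp8: bool, use_nvfp4: bool, x_sf_scale=None) -> None:
    # FP8 and NVFP4 quantization are mutually exclusive per dispatch call.
    assert not (use_fp8 and use_nvfp4), "use_fp8 and use_nvfp4 cannot both be set"
    # The NVFP4 path also requires a global scale (see internode_ll.cu below).
    if use_nvfp4:
        assert x_sf_scale is not None, "NVFP4 dispatch needs x_sf_scale"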

csrc/kernels/internode_ll.cu

Lines changed: 3 additions & 4 deletions
@@ -269,10 +269,9 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
     thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0;
     float SFScaleVal = 1.0f;
     if constexpr (kUseNVFP4) {
-        // Get scaling value: if x_sf_scale is nullptr, use 1.0f;
-        if (x_sf_scale != nullptr) {
-            SFScaleVal = *(static_cast<const float*>(x_sf_scale));
-        }
+        // Get scaling value;
+        EP_DEVICE_ASSERT(x_sf_scale != nullptr);
+        SFScaleVal = *(static_cast<const float*>(x_sf_scale));
     }
 
     // FP8 or NVFP4 cast

deep_ep/buffer.py

Lines changed: 5 additions & 1 deletion
@@ -573,7 +573,11 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
             if `use_ue8m0=False`. With `use_ue8m0=True`, the second one is packed and shaped as
             `[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 512]` with type `torch.int`.
             Notice that, the last-two-dimension of the scaling tensors are in column-major for TMA compatibility.
-            With `use_fp8=False`, the result would be a tensor shaped as
+            With `use_nvfp4=True`: the first element is a `torch.Tensor` shaped as
+            `[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 4]` with `torch.uint32`.
+            The second tensor is the corresponding scales for the first element with shape
+            `[32, 4, num_max_dispatch_tokens_per_rank * num_ranks // 128, 4, hidden // 64, num_local_experts]` with `torch.uint8`.
+            With `use_fp8=False and use_nvfp4=False`, the result would be a tensor shaped as
             `[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`.
             Moreover, not all tokens are valid, only some of the `num_max_dispatch_tokens_per_rank * num_ranks` are,
             as we do not synchronize CPU received count with GPU (also not incompatible with CUDA graph if synced).
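As a quick sanity check of the NVFP4 shapes documented above, the sketch below evaluates both return shapes for illustrative sizes; the concrete values (hidden=7168, 128 tokens per rank, 8 ranks, 16 local experts) are assumptions for the example, not values taken from this commit:

# Worked example of the NVFP4 return shapes from the docstring; sizes are illustrative.
hidden = 7168
num_max_dispatch_tokens_per_rank, num_ranks, num_local_experts = 128, 8, 16
num_tokens = num_max_dispatch_tokens_per_rank * num_ranks  # 1024

packed_x_shape = (num_local_experts, num_tokens, hidden // 4)
# -> (16, 1024, 1792), dtype torch.uint32
scales_shape = (32, 4, num_tokens // 128, 4, hidden // 64, num_local_experts)
# -> (32, 4, 8, 4, 112, 16), dtype torch.uint8
print(packed_x_shape, scales_shape)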

tests/test_low_latency.py

Lines changed: 4 additions & 2 deletions
@@ -61,8 +61,8 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
     for i in range((num_times % 2) + 1):
         cumulative_local_expert_recv_stats = torch.zeros((num_local_experts, ), dtype=torch.int, device='cuda')
         x_max = torch.max(torch.abs(current_x))
-        dist.all_reduce(x_max, op=dist.ReduceOp.MAX, group=group)
         x_sf_scale = (MAX_E4M3 * MAX_NVFP4) / x_max.to(torch.float32)
+        dist.all_reduce(x_sf_scale, op=dist.ReduceOp.MIN, group=group)
         packed_recv_x, packed_recv_count, handle, event, hook = \
             buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                         use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
@@ -153,10 +153,12 @@ def large_gemm_with_hook(hook):
 
         # noinspection PyShadowingNames
        def test_func(return_recv_hook: bool):
+            # NOTE: use nvfp4
            recv_x, recv_count, handle, event, hook = \
                buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                            cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
-                                            use_fp8=True, async_finish=False, return_recv_hook=return_recv_hook)
+                                            use_fp8=False, use_nvfp4=True, x_sf_scale=x_sf_scale,
+                                            async_finish=False, return_recv_hook=return_recv_hook)
            large_gemm_with_hook(hook) if return_recv_hook else None
            combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
                                                                 use_logfmt=use_logfmt, return_recv_hook=return_recv_hook)
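The test now derives the NVFP4 global scale on each rank first and all-reduces the scale with MIN, instead of all-reducing the amax with MAX; since (MAX_E4M3 * MAX_NVFP4) / x_max decreases as x_max grows, both orders produce the same global scale. A small worked check below; the constants 448 and 6 are the usual FP8-e4m3 and FP4-e2m1 maxima and are assumed here to match the test's MAX_E4M3 / MAX_NVFP4, and the per-rank amax values are hypothetical:

# Equivalence check for the reordered reduction in the test (illustrative values).
MAX_E4M3, MAX_NVFP4 = 448.0, 6.0   # assumed values of the test's constants
local_x_max = [2.0, 8.0, 4.0]      # hypothetical per-rank amax of |x|

# Old order: all-reduce the amax with MAX, then compute the scale.
scale_old = (MAX_E4M3 * MAX_NVFP4) / max(local_x_max)             # 2688 / 8 = 336.0
# New order: compute per-rank scales, then all-reduce them with MIN.
scale_new = min((MAX_E4M3 * MAX_NVFP4) / m for m in local_x_max)  # min(1344, 336, 672) = 336.0
assert scale_old == scale_new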
