
Commit c358fd5 (parent: 2e6ef2b)
Commit message: code clean

File tree: 3 files changed, +14 -20 lines


csrc/deep_ep.cpp

Lines changed: 0 additions & 12 deletions

@@ -1224,18 +1224,6 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
     std::optional<std::function<void()>> recv_hook = std::nullopt;
     if (return_recv_hook)
         recv_hook = [=]() { launcher(LOW_LATENCY_RECV_PHASE); };
-    // if (not use_fp8 and use_nvfp4) {
-    //     constexpr int kNumPerChannels = 16;
-    //     constexpr int NUM_SF_ELEMS_PER_PACK = 4;
-    //     constexpr int mTileSize_dim_0 = 32;
-    //     constexpr int mTileSize_dim_1 = 4;
-    //     constexpr int mTileSize = mTileSize_dim_0 * mTileSize_dim_1;
-
-    //     auto l = num_local_experts;
-    //     auto m = num_ranks * num_max_dispatch_tokens_per_rank;
-    //     auto rk = hidden / (kNumPerChannels * NUM_SF_ELEMS_PER_PACK);
-    //     packed_recv_x_scales = packed_recv_x_scales.value().contiguous().view(torch::kInt).reshape({l, m, rk});
-    // }
     // Return values
     return {packed_recv_x, packed_recv_x_scales, packed_recv_x_sf_scale, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, recv_hook};
 #else
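
The block removed above had only ever been a comment; it sketched how the received NVFP4 scales could be reshaped on the host into an int32 tensor of shape {l, m, rk}. That reshape now happens in tests/test_low_latency.py instead (see the last file below). A small Python sketch of just the shape arithmetic, with made-up sizes (not code from this commit):

# Hypothetical sizes, only to illustrate the l/m/rk arithmetic in the deleted comment.
num_local_experts = 2
num_ranks = 4
num_max_dispatch_tokens_per_rank = 128
hidden = 7168

kNumPerChannels = 16         # 16 hidden values share one FP4 scale
NUM_SF_ELEMS_PER_PACK = 4    # 4 uint8 scales viewed as one int32

l = num_local_experts
m = num_ranks * num_max_dispatch_tokens_per_rank
rk = hidden // (kNumPerChannels * NUM_SF_ELEMS_PER_PACK)
print(l, m, rk)              # 2 512 112 -> target scale shape {l, m, rk}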

csrc/kernels/internode_ll.cu

Lines changed: 9 additions & 8 deletions

@@ -57,7 +57,7 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
     return b;
 }
 
-// float to e2m1 4bit (sign:1, exp:2, mantissa:1) quantization
+// Convert 1 float value into 8 e2m1 values (4bit, sign:1, exp:2, mantissa:1) quantization.
 __device__ inline uint8_t float_to_e2m1(float f) {
     // Get sign
     uint8_t sign = (f < 0);
@@ -92,8 +92,9 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
     return val;
 #else
 #if !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
-#pragma message("warning: this architecture does not support cvt.rn.satfinite.e2m1x2.f32, use float_to_e2m1 instead.")
-#endif
+#pragma message("warning: this architecture does not support " \
+                "cvt.rn.satfinite.e2m1x2.f32, use user defined " \
+                "float_to_e2m1 to convert float values to e2m1 values.")
     uint32_t val = 0;
     float* data = reinterpret_cast<float*>(&array[0]);
     for (int i = 0; i < 8; ++i) {
@@ -125,7 +126,9 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
     return val;
 #else
 #if !(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
-#pragma message("warning: this architecture does not support cvt.rn.satfinite.e2m1x2.f32, use float_to_e2m1 instead.")
+#pragma message("warning: this architecture does not support " \
+                "cvt.rn.satfinite.e2m1x2.f32, use user defined " \
+                "float_to_e2m1 to convert float values to e2m1 values.")
 #endif
     uint32_t val = 0;
     float* data = reinterpret_cast<float*>(&array[0]);
@@ -542,9 +545,8 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf
                 auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id + 32));
                 recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
             }
-        } else if constexpr (kUseNVFP4) {
-            // Equivalent CuTe layout:
-            // (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1))
+        } else if constexpr (kUseNVFP4) {
+            // The physical layout is (l, rm, rk, 32, 4, 4).
            const auto src_scales = reinterpret_cast<uint8_t*>(reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
             const auto num_elems_per_pack = static_cast<int>(sizeof(packed_t) / sizeof(scale_t));
             const auto token_idx = recv_token_begin_idx + i;
@@ -557,7 +559,6 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf
                 const auto pack_idx = j / num_elems_per_pack;
                 const auto elem_idx = j % num_elems_per_pack;
                 auto scale = ld_nc_global(src_scales + j);
-                // recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
                 recv_x_scales[rm * token_stride * 128 + pack_idx * pack_stride * 128 + rm_res * pack_stride + elem_idx] = scale;
             }
         }
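
For readers unfamiliar with the format discussed above: e2m1 is a 4-bit float with 1 sign bit, 2 exponent bits and 1 mantissa bit, so the representable magnitudes are 0, 0.5, 1, 1.5, 2, 3, 4 and 6. Below is a tiny pure-Python reference sketch of that format (my own illustration, not code from this commit); tie-breaking and nibble ordering are simplified and may differ from the kernel's float_to_e2m1 and from cvt.rn.satfinite.e2m1x2.f32, which rounds to nearest-even.

E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # magnitudes for codes 0..7

def float_to_e2m1_ref(f: float) -> int:
    """Quantize one float to a 4-bit e2m1 code: bit 3 = sign, bits 2..0 = magnitude code."""
    sign = 1 if f < 0 else 0
    mag = min(abs(f), 6.0)  # saturate at the largest finite e2m1 magnitude
    code = min(range(8), key=lambda i: abs(E2M1_MAGNITUDES[i] - mag))  # nearest; ties pick the smaller code
    return (sign << 3) | code

def fp32_vec_to_e2m1_ref(values) -> int:
    """Pack 8 floats into one 32-bit word, 4 bits each (nibble order here is illustrative only)."""
    assert len(values) == 8
    packed = 0
    for i, v in enumerate(values):
        packed |= float_to_e2m1_ref(v) << (4 * i)
    return packed

print(float_to_e2m1_ref(2.4))   # 4  -> nearest representable magnitude is 2.0
print(float_to_e2m1_ref(-7.0))  # 15 -> sign bit set, saturated to the max magnitude 6.0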

tests/test_low_latency.py

Lines changed: 5 additions & 0 deletions

@@ -73,9 +73,14 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
     if dispatch_use_fp8:
         packed_recv_x = (packed_recv_x[0], packed_recv_x[1].contiguous())
     elif dispatch_use_nvfp4:
+        # For the received x_scale, its dtype is int8, its physical layout is (l, rm, rk, 32, 4, 4),
+        # and its logical shape is (32, 4, rm, 4, rk, l).
         recv_x_scale_view = packed_recv_x[1].clone()
+        # After permute, the logical shape will be (l, rm, 32, 4, rk, 4).
         recv_x_scale_view = recv_x_scale_view.permute(5, 2, 0, 1, 4, 3)
+        # After view, the logical shape will be (l, rm, 32, 4, rk), the dtype is int32.
         recv_x_scale_view = recv_x_scale_view.contiguous().view(torch.int32)
+        # After view, the logical shape will be (l, rm * 32 * 4, rk), the dtype is int32.
         recv_x_scale_view = recv_x_scale_view.contiguous().view(num_local_experts, int(num_ranks * num_tokens), hidden // (16 * 4))
         packed_recv_x = (packed_recv_x[0], recv_x_scale_view, packed_recv_x[2].contiguous())
     else:
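
To follow the shape bookkeeping above end to end, here is a small standalone sketch that only mirrors the permute/view sequence added in the test, with made-up sizes (per the comments, rm = num_ranks * num_tokens / 128 and rk = hidden / 64):

import torch

# Hypothetical sizes for illustration: num_ranks * num_tokens = rm * 128, hidden = rk * 64.
l, rm, rk = 2, 3, 14
num_local_experts = l
num_tokens_total = rm * 128          # stands in for num_ranks * num_tokens
hidden = rk * 64

# Logical shape of packed_recv_x[1] per the comments above: (32, 4, rm, 4, rk, l), dtype int8.
scales = torch.zeros((32, 4, rm, 4, rk, l), dtype=torch.int8)

view = scales.permute(5, 2, 0, 1, 4, 3)        # -> (l, rm, 32, 4, rk, 4)
view = view.contiguous().view(torch.int32)     # 4 x int8 -> 1 x int32 along the last dim: (l, rm, 32, 4, rk, 1)
view = view.contiguous().view(num_local_experts, num_tokens_total, hidden // (16 * 4))
print(view.shape)                              # torch.Size([2, 384, 14]) == (l, rm * 32 * 4, rk)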
