
Commit 57a670a

committed
fix issue with padding m
1 parent: 82147f2

File tree

csrc/kernels/internode_ll.cu
tests/test_low_latency.py
tests/utils.py

3 files changed: 3 additions, 4 deletions


csrc/kernels/internode_ll.cu

Lines changed: 2 additions & 1 deletion
@@ -450,7 +450,8 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
     const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
     const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks;
     const auto num_aligned_scales = align<int>(num_scales, sizeof(float) / sizeof(scale_t));
-    const auto recv_x_scales = static_cast<scale_t*>(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales;
+    const auto num_aligned_tokens = align<int>(num_ranks * num_max_dispatch_tokens_per_rank, 128);
+    const auto recv_x_scales = static_cast<scale_t*>(packed_recv_x_scales) + local_expert_idx * num_aligned_tokens * num_aligned_scales;

     // Shared between sub-warps in warp groups
     __shared__ int shared_num_recv_tokens[kNumMaxWarpGroups], shared_recv_token_begin_idx[kNumMaxWarpGroups];
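The old stride used the raw token count; the new one first pads num_ranks * num_max_dispatch_tokens_per_rank up to a multiple of 128, so each expert's pointer into packed_recv_x_scales matches a scales buffer whose token (M) dimension is 128-aligned per expert. A minimal sketch of the arithmetic, with made-up sizes (none of these concrete numbers come from the commit):

def align(x: int, alignment: int) -> int:
    # Round x up to a multiple of `alignment`, mirroring the CUDA align<int> helper.
    return (x + alignment - 1) // alignment * alignment

num_ranks = 8                            # illustration value only
num_max_dispatch_tokens_per_rank = 100   # illustration value only
num_aligned_scales = 14                  # illustration value only

num_tokens = num_ranks * num_max_dispatch_tokens_per_rank   # 800
num_aligned_tokens = align(num_tokens, 128)                 # 896

for local_expert_idx in range(3):
    old_offset = local_expert_idx * num_tokens * num_aligned_scales
    new_offset = local_expert_idx * num_aligned_tokens * num_aligned_scales
    # If the scales buffer pads each expert's token dimension to 128, the old
    # stride under-shoots for every expert after the first (0 vs 0, 11200 vs
    # 12544, 22400 vs 25088), landing inside the previous expert's padding.
    print(local_expert_idx, old_offset, new_offset)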

tests/test_low_latency.py

Lines changed: 0 additions & 1 deletion
@@ -77,7 +77,6 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
    elif dispatch_use_nvfp4:
        recv_x_scale_view = packed_recv_x[1]
        recv_x_scale_view = recv_x_scale_view.permute(5, 2, 0, 1, 4, 3)
-        print(f'for num_times: {num_times}, recv_x_scale_view.shape: {recv_x_scale_view.shape}')
        recv_x_scale_view = recv_x_scale_view.contiguous().view(num_local_experts, int(num_ranks * num_tokens), hidden // 16)
        packed_recv_x = (packed_recv_x[0], recv_x_scale_view)
        simulated_gemm_x = per_token_cast_back(packed_recv_x[0], packed_recv_x[1], x_global_scales, use_ue8m0_for_nvfp4_x_scale=use_ue8m0_for_nvfp4_x_scale, src_data_format='nvfp4')
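For context on the unchanged lines around the deleted debug print: the permute-then-view idiom takes the kernel's blocked 6-D scale layout and flattens it to (num_local_experts, num_ranks * num_tokens, hidden // 16); .contiguous() materializes the permuted order so .view can reinterpret the storage. A toy-shape sketch of the mechanics only (all dimension sizes invented; the kernel's actual 6-D layout is not spelled out in this diff):

import torch

num_local_experts, num_ranks, num_tokens, hidden = 2, 2, 4, 64  # invented sizes

# A 6-D tensor whose axes line up so that permute(5, 2, 0, 1, 4, 3) puts the
# expert axis first; the axis meanings here are placeholders, not the real layout.
t = torch.arange(64, dtype=torch.float32).view(2, 2, 4, 1, 2, 2)
v = t.permute(5, 2, 0, 1, 4, 3)        # shape (2, 4, 2, 2, 2, 1), non-contiguous
flat = v.contiguous().view(num_local_experts, num_ranks * num_tokens, hidden // 16)
print(flat.shape)                      # torch.Size([2, 8, 4])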

tests/utils.py

Lines changed: 1 addition & 2 deletions
@@ -90,8 +90,7 @@ def int32_to_8floats_lookup(tensor: torch.Tensor, table: torch.Tensor) -> torch.Tensor:
    return out


-def cast_nvfp4_to_fp32(x_nvfp4: torch.Tensor, x_scales: torch.Tensor, x_global_scales: float, use_ue8m0_for_nvfp4_x_scale: bool = False):
-    assert(x_global_scales.dim() == 0, f"expect x_global_scales.dim() == 0, but got {x_global_scales.dim()}")
+def cast_nvfp4_to_fp32(x_nvfp4: torch.Tensor, x_scales: torch.Tensor, x_global_scales: torch.Tensor, use_ue8m0_for_nvfp4_x_scale: bool = False):
    NVFP4_TABLE = torch.tensor([0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1.0, -1.5, -2, -3, -4, -6], dtype=torch.float32, device='cuda')
    if use_ue8m0_for_nvfp4_x_scale:
        x_scales = x_scales.view(dtype=torch.int8).to(torch.int) << 23
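Two notes on the replaced lines. The dropped assert used the tuple form assert(cond, msg), which Python evaluates as a non-empty tuple and therefore never fails, and its .dim() call already contradicted the old float annotation, hence the switch to torch.Tensor. Separately, the << 23 line works because a UE8M0 scale is a bare 8-bit IEEE-754 exponent: shifting it into bits 30..23 of an int32 and reinterpreting as float32 yields exactly 2**(e - 127). A hedged sketch of both the exponent trick and the nibble lookup (the low-nibble-first packing and the final float32 reinterpret are assumptions, not shown in this hunk):

import torch

# UE8M0 -> float32: exponent bits shifted into place give exact powers of two.
e = torch.tensor([126, 127, 128, 130], dtype=torch.uint8)  # unsigned for clarity
scales = (e.to(torch.int32) << 23).view(torch.float32)
print(scales)  # tensor([0.5000, 1.0000, 2.0000, 8.0000])

# Decoding two packed FP4 codes per byte with the 16-entry table from the diff
# (low-nibble-first packing is an assumption; the real unpacking lives in
# int32_to_8floats_lookup).
NVFP4_TABLE = torch.tensor([0, 0.5, 1, 1.5, 2, 3, 4, 6,
                            0, -0.5, -1.0, -1.5, -2, -3, -4, -6], dtype=torch.float32)
byte = torch.tensor([0x3A], dtype=torch.uint8)  # nibbles 0xA (low) and 0x3 (high)
lo, hi = byte & 0xF, byte >> 4
print(NVFP4_TABLE[lo.long()], NVFP4_TABLE[hi.long()])  # tensor([-1.]) tensor([1.5000])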
