
Commit d2fae7a

[mxfp8 moe training] update 3d quant colwise scaling kernel to use single input/output TMA descriptors (#3034)
1 parent ae12e42 commit d2fae7a

File tree: 3 files changed, +148 -127 lines changed


test/prototype/moe_training/test_kernels.py

Lines changed: 2 additions & 3 deletions
@@ -325,8 +325,8 @@ def test_triton_mx_block_rearrange_2d_K_groups(
     reason="MXFP8 requires CUDA capability 10.0 or greater",
 )
 @pytest.mark.parametrize("E", (1, 2, 4, 8))
-@pytest.mark.parametrize("N", (32, 64, 8192))
-@pytest.mark.parametrize("K", (32, 64, 8192))
+@pytest.mark.parametrize("N", (32, 1536, 5120, 7168, 8192))
+@pytest.mark.parametrize("K", (32, 1536, 5120, 7168, 8192))
 @pytest.mark.parametrize("input_dtype", (torch.bfloat16,))
 @pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.FLOOR,))
 def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):

@@ -361,7 +361,6 @@ def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
     y_d1, s_d1 = mxfp8_cuda.quantize_3d(
         x, scale_dim_n=block_size, scaling_mode=scaling_mode_str
     )
-
     # Check scales
     torch.testing.assert_close(s_d1, s_d1_ref, rtol=0, atol=0)
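
For reference, a minimal sketch of how the quantize_3d path exercised by this test can be called. The import path mxfp8_cuda, block_size=32, and the "floor" scaling-mode string are assumptions inferred from the surrounding test code, not shown in this diff:

import torch
import mxfp8_cuda  # assumed import path for the compiled extension

E, N, K = 2, 1536, 5120  # one of the newly added (E, N, K) test shapes
block_size = 32          # MX block size along the scaled dim (assumed)
x = torch.randn(E, N, K, dtype=torch.bfloat16, device="cuda")

# Column-wise (dim1) 3D quantization, mirroring test_cuda_mx_dim1_3d_numerics.
y_d1, s_d1 = mxfp8_cuda.quantize_3d(
    x, scale_dim_n=block_size, scaling_mode="floor"  # scaling_mode string assumed
)
print(y_d1.shape, s_d1.shape)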

torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp

Lines changed: 3 additions & 2 deletions
@@ -54,7 +54,8 @@ mxfp8_quantize(torch::Tensor input, bool rowwise, bool colwise,
 
   // Validate inputs
   TORCH_CHECK(!rowwise, "rowwise scaling is not supported yet");
-  check_cuda_tensor(input, "input");
+  TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
+  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
   TORCH_CHECK(input.dim() == 2, "input must be 2D");
   TORCH_CHECK(input.scalar_type() == torch::kFloat32 ||
               input.scalar_type() == torch::kFloat16 ||

@@ -130,6 +131,7 @@ mxfp8_quantize_3d(torch::Tensor input, int64_t scale_dim_n,
 
   // Validate inputs
   TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
+  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
   // Note: We don't check contiguous for 3D as it may have column major strides
   TORCH_CHECK(input.dim() == 3, "input must be 3D");
   TORCH_CHECK(input.scalar_type() == torch::kFloat32 ||

@@ -148,7 +150,6 @@ mxfp8_quantize_3d(torch::Tensor input, int64_t scale_dim_n,
   TORCH_CHECK((N >= 32) && (N % 32 == 0), "N must be a multiple of 32");
   TORCH_CHECK((K >= 32) && (K % 32 == 0), "K must be a multiple of 32");
 
-  // The kernel should work with any stride pattern - no layout requirements
 
   c10::cuda::CUDAGuard device_guard(input.device());
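
The validation change above makes both entry points require contiguous CUDA inputs. A minimal sketch of the expected Python-side behavior, assuming the TORCH_CHECK failure surfaces as a RuntimeError and that the import path and "floor" scaling-mode string are as in the sketch above (both assumptions):

import torch
import mxfp8_cuda  # assumed import path for the compiled extension

x = torch.randn(2, 64, 64, dtype=torch.bfloat16, device="cuda")
x_noncontig = x.transpose(1, 2)  # non-contiguous view of the same data

try:
    mxfp8_cuda.quantize_3d(x_noncontig, scale_dim_n=32, scaling_mode="floor")
except RuntimeError as err:
    print(err)  # expected to mention: "input must be contiguous"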
