Commit b644211

[mxfp8 moe training] integrate mxfp8 dim0 cast triton kernel (#3186)
* [mxfp8 moe training] integrate mxfp8 dim0 cast triton kernel
* handle nans
1 parent e8471c6 commit b644211
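This commit swaps the eager `to_mx` cast for the fused `triton_to_mxfp8_dim0` Triton kernel on the dim0 (rowwise) quantization path of the mxfp8 MoE training grouped GEMM, and hardens the scale calculation so all-zero blocks no longer produce NaNs. The sketch below shows the op's contract as it appears in the diffs that follow; the input shape is borrowed from the new zeros test and is illustrative only.

import torch

from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim0

# Example shape taken from the new zeros test; any 2D bf16 CUDA tensor works the same way.
x = torch.randn(8192, 5120, dtype=torch.bfloat16, device="cuda")

# Rowwise (dim0) mxfp8 cast with 1x32 scaling blocks.
data, scale = triton_to_mxfp8_dim0(x, inner_block_size=32)

assert data.dtype == torch.float8_e4m3fn    # quantized values
assert scale.dtype == torch.float8_e8m0fnu  # one e8m0 scale per 32 elements along dim0
assert scale.shape == (8192, 5120 // 32)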

4 files changed: 86 additions, 84 deletions

test/prototype/moe_training/test_scaled_grouped_mm.py

Lines changed: 11 additions & 4 deletions
@@ -8,7 +8,7 @@
 import torch
 from torch.nn import functional as F

-from torchao.utils import torch_version_at_least
+from torchao.utils import is_sm_version, torch_version_at_least

 # We need to skip before doing any imports which would use triton, since
 # triton won't be available on CPU builds and torch < 2.5
@@ -28,6 +28,7 @@
 from torchao.float8.float8_linear import matmul_with_hp_or_float8_args
 from torchao.float8.float8_training_tensor import LinearMMConfig
 from torchao.float8.float8_utils import compute_error, tensor_to_scale, to_fp8_saturated
+from torchao.prototype.moe_training.conversion_utils import MoEScalingType
 from torchao.prototype.moe_training.scaled_grouped_mm import (
     _emulated_mxfp8_scaled_grouped_mm_2d_2d,
     _emulated_mxfp8_scaled_grouped_mm_2d_3d,
@@ -43,10 +44,15 @@


 @skip_if_rocm("ROCm not supported")
-def test_valid_scaled_grouped_mm_2d_3d():
+@pytest.mark.parametrize("m", [131072])
+@pytest.mark.parametrize("n", [8192])
+@pytest.mark.parametrize("k", [5120])
+@pytest.mark.parametrize("n_groups", [1, 2, 4, 8])
+def test_valid_scaled_grouped_mm_2d_3d(m, n, k, n_groups):
+    if not is_sm_version(9, 0):
+        pytest.skip("Skipping FP8 rowwise test, requires sm90")
     out_dtype = torch.bfloat16
     device = "cuda"
-    m, n, k, n_groups = 16, 32, 16, 4
     a = torch.randn(
         m * n_groups,
         k,
@@ -72,6 +78,7 @@ def test_valid_scaled_grouped_mm_2d_3d():
         b_t,
         offs=offs,
         out_dtype=out_dtype,
+        scaling_type=MoEScalingType.FP8_ROWWISE,
     )

     # Validate result.
@@ -307,7 +314,7 @@ def test_emulate_mxfp8_grouped_gemm_2d_2d(M, N, num_experts):

 @skip_if_rocm("ROCm not supported")
 @pytest.mark.parametrize(
-    "M,K,N", [(1024, 5120, 8192), (2048, 5120, 8192), (16640, 5120, 8192)]
+    "M,K,N", [(16640, 5120, 8192), (131072, 5120, 8192), (131072, 8192, 5120)]
 )
 @pytest.mark.parametrize("num_experts", (2, 4, 8, 16))
 def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(M, K, N, num_experts):

test/prototype/mx_formats/test_kernels.py

Lines changed: 16 additions & 2 deletions
@@ -486,8 +486,8 @@ def test_triton_mxfp8_dim1_randn(M, K):

 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 @pytest.mark.skipif(
-    not is_sm_at_least_89(),
-    reason="float8 in triton requires CUDA capability 8.9 or greater",
+    not is_sm_at_least_100(),
+    reason="mxfp8 requires CUDA capability 10.0 or greater",
 )
 @pytest.mark.parametrize("M", (256, 2048, 131072))
 @pytest.mark.parametrize("K", (256, 5120, 7168))
@@ -499,6 +499,20 @@ def test_triton_mxfp8_dim0_randn(M, K):
     torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)


+@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="mxfp8 requires CUDA capability 10.0 or greater",
+)
+def test_triton_mxfp8_dim0_zeros():
+    x = torch.zeros(8192, 5120, dtype=torch.bfloat16, device="cuda")
+    x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(x, block_size=32)
+    x_mx_t, x_s_t = triton_to_mxfp8_dim0(x, inner_block_size=32)
+    assert not x_mx_t.isnan().any(), "quantized tensor should not contain NaNs"
+    torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
+    torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize(
     "shape",

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 8 additions & 8 deletions
@@ -32,7 +32,7 @@
     MXGemmKernelChoice,
     ScaleCalculationMode,
 )
-from torchao.prototype.mx_formats.mx_tensor import to_mx
+from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim0
 from torchao.prototype.mx_formats.utils import _to_mxfp8_dim1_kernel_wrapper

 logger: logging.Logger = logging.getLogger(__name__)
@@ -303,16 +303,16 @@ def forward(

         # A_data shape: (M, K)
         # A_scale shape: (M, K//block_size)
-        A_scale, A_data = to_mx(
-            A, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+        A_data, A_scale = triton_to_mxfp8_dim0(
+            A,
+            inner_block_size=block_size,
         )

         # B_data shape: (E, N, K)
         # B_scale shape: (E, N, K//block_size)
-        B_scales, B_data = to_mx(
+        B_data, B_scales = triton_to_mxfp8_dim0(
             B_t.transpose(-2, -1),
-            elem_dtype=torch.float8_e4m3fn,
-            block_size=block_size,
+            inner_block_size=block_size,
         )

         # Convert scales to blocked format for 2d-3d grouped mm
@@ -351,8 +351,8 @@ def backward(ctx, grad_out: torch.Tensor):

         # grad_out_data shape: (M, N)
         # grad_out_scale shape: (M, N//block_size)
-        grad_out_scale, grad_out_data = to_mx(
-            grad_out, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+        grad_out_data, grad_out_scale = triton_to_mxfp8_dim0(
+            grad_out, inner_block_size=block_size
         )

         # Quantize 3d expert weights along N (contraction dimension for next grouped gemm)
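Note the swapped unpacking order at each call site above: the eager `to_mx` reference returns `(scale, data)`, while `triton_to_mxfp8_dim0` returns `(data, scale)`. A minimal side-by-side sketch of that difference, where the tensor `A` and `block_size` are illustrative placeholders, not values from this file:

import torch

from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim0
from torchao.prototype.mx_formats.mx_tensor import to_mx

block_size = 32
A = torch.randn(1024, 5120, dtype=torch.bfloat16, device="cuda")  # placeholder input

# Previous path: eager cast, which returns (scale, data).
A_scale_ref, A_data_ref = to_mx(A, elem_dtype=torch.float8_e4m3fn, block_size=block_size)

# New path: fused triton kernel, which returns (data, scale).
A_data, A_scale = triton_to_mxfp8_dim0(A, inner_block_size=block_size)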

torchao/prototype/mx_formats/kernels.py

Lines changed: 51 additions & 70 deletions
@@ -852,8 +852,9 @@ def _triton_calculate_scale(x, axis):
     scale_e8m0_unbiased = extracted_pow2.to(tl.bfloat16)

     # Clamp to exponents that can be represented in e8m0
+    # Add 1 to capture NaNs
     scale_e8m0_unbiased = tl.clamp(
-        scale_e8m0_unbiased, -1 * e8m0_exponent_bias, e8m0_exponent_bias
+        scale_e8m0_unbiased, -1 * e8m0_exponent_bias, e8m0_exponent_bias + 1
     )

     # Create the biased e8m0 representation and cast it to 8 bits
@@ -863,15 +864,18 @@ def _triton_calculate_scale(x, axis):
     # TODO(future PR): add NaN handling here,
     # https://github.com/pytorch/pytorch/pull/100572 will likely be useful to
     # get proper NaN propagation working
-
     # Calculate the scale in floating point.
     scale_fp = (scale_e8m0_biased.to(tl.int32) << fp32_mbits).to(
         tl.float32, bitcast=True
     )

+    fp32_exp_bias = 127.0
+    fp32_min_normal = tl.exp2(-fp32_exp_bias + 1)
+    scale_fp = tl.clamp(scale_fp, min=fp32_min_normal, max=float("inf"))
+
     return scale_fp, scale_e8m0_biased

-def _get_mxfp8_dim1_kernel_autotune_configs():
+def _get_mxfp8_quant_autotune_configs():
     # Values to sweep over here were determined by a manual
     # sweep over a small set of shapes, it's likely that this
     # can be improved in the future.
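The scale-calculation hunks above are the "handle nans" part of the commit: raising the exponent clamp to `e8m0_exponent_bias + 1` lets a NaN amax reach the e8m0 NaN encoding (biased 0xFF) instead of saturating to a finite scale, and flooring `scale_fp` at the smallest normal fp32 value keeps an all-zero block from dividing 0 by 0, which is what the new `test_triton_mxfp8_dim0_zeros` exercises. A minimal sketch of the zero-block case in plain tensor arithmetic (not the triton kernel):

import torch

# A hypothetical all-zeros 1x32 scaling block.
block = torch.zeros(32, dtype=torch.float32)

# Without the floor, an amax of 0 yields a zero fp32 scale, and 0 / 0 is NaN.
zero_scale = torch.tensor(0.0)
print((block / zero_scale).isnan().all())      # tensor(True)

# With the floor at the smallest normal fp32 value, zeros quantize to exact zeros.
fp32_min_normal = 2.0 ** -126                  # tl.exp2(-127.0 + 1) in the kernel
print((block / fp32_min_normal).eq(0).all())   # tensor(True)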
@@ -890,7 +894,7 @@ def _get_mxfp8_dim1_kernel_autotune_configs():
     return results

 @triton.autotune(
-    configs=_get_mxfp8_dim1_kernel_autotune_configs(),
+    configs=_get_mxfp8_quant_autotune_configs(),
     key=["n_cols", "INNER_BLOCK_SIZE"],
 )
 @triton.jit
@@ -1039,110 +1043,87 @@ def to_mxfp8_dim1_kernel(
     tl.store(col_scale_start_ptr + col_scale_indices, col_scale_e8m0)

 @triton.autotune(
-    configs=_get_mxfp8_dim1_kernel_autotune_configs(),
-    key=["n_cols", "INNER_BLOCK_SIZE"],
+    configs=_get_mxfp8_quant_autotune_configs(),
+    key=["n_cols", "SCALE_BLOCK_SIZE"],
 )
 @triton.jit
 def to_mxfp8_dim0_kernel(
-    x_ptr,  # pointer to input tensor
-    output_ptr,  # pointer to output tensor (row-normalized)
-    row_scale_ptr,  # pointer to store row-wise maximum absolute values
-    n_rows,  # number of rows in the tensor
-    n_cols,  # number of columns in the tensor
+    x_ptr,
+    output_ptr,
+    scale_ptr,
+    n_rows,
+    n_cols,
     ROW_TILE_SIZE: tl.constexpr,
     COL_TILE_SIZE: tl.constexpr,
-    INNER_BLOCK_SIZE: tl.constexpr,  # should be 32 for MX
+    SCALE_BLOCK_SIZE: tl.constexpr,  # should be 32 for MX
 ):
     """
     Quantizes a high precision tensor to mxfp8 rowwise (1x32 scaling granularity).
-
-    This is the counterpart to to_mxfp8_dim1_kernel which does columnwise quantization.
-    Instead of transposing and scaling across columns, this kernel scales across rows.
     """

-    BLOCKS_PER_COL_TILE: tl.constexpr = COL_TILE_SIZE // INNER_BLOCK_SIZE
+    SCALE_BLOCKS_PER_COL_TILE: tl.constexpr = COL_TILE_SIZE // SCALE_BLOCK_SIZE

     # Get program ID
     pid_row = tl.program_id(0)
     pid_col = tl.program_id(1)

-    # Calculate starting row and column for this tile
     start_row = pid_row * ROW_TILE_SIZE
     start_col = pid_col * COL_TILE_SIZE
-
-    # Create offsets for the block
-    row_offsets = tl.arange(0, ROW_TILE_SIZE)
-    col_offsets = tl.arange(0, COL_TILE_SIZE)
-
-    # Compute global row/col positions
-    rows = start_row + row_offsets[:, None]
-    cols = start_col + col_offsets[None, :]
-
-    # Create masks for out-of-bounds accesses
-    row_mask = rows < n_rows
-    col_mask = cols < n_cols
-    mask = row_mask & col_mask
+    row_offs = start_row + tl.arange(0, ROW_TILE_SIZE)[:, None]
+    col_offs = start_col + tl.arange(0, COL_TILE_SIZE)[None, :]

     # Compute memory offsets for row-major layout (rows, cols)
-    row_major_offsets = (rows * n_cols + cols).to(tl.int32)
+    row_major_offsets = (row_offs * n_cols + col_offs).to(tl.int32)

     # Load the entire block in a single operation
     # shape: (ROW_TILE_SIZE, COL_TILE_SIZE)
+    mask = (row_offs < n_rows) & (col_offs < n_cols)
     x_block = tl.load(x_ptr + row_major_offsets, mask=mask)

     # Reshape to inner tile size for rowwise scaling
-    # shape: (ROW_TILE_SIZE, COL_TILE_SIZE) -> (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, INNER_BLOCK_SIZE)
+    # shape: (ROW_TILE_SIZE, COL_TILE_SIZE) -> (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, SCALE_BLOCK_SIZE)
     x_block_r = x_block.reshape(
-        ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, INNER_BLOCK_SIZE
+        ROW_TILE_SIZE * SCALE_BLOCKS_PER_COL_TILE, SCALE_BLOCK_SIZE
     )

     # Calculate the absolute values of elements in the block
     x_block_abs_r = tl.abs(x_block_r)

     # Find the maximum absolute value for each row (across columns)
     # shape: (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE,)
-    row_scale_r, row_scale_e8m0_r = _triton_calculate_scale(x_block_abs_r, axis=1)
+    scale_fp32_r, scale_e8m0_r = _triton_calculate_scale(x_block_abs_r, axis=1)

     # Divide each row by scale
-    # Broadcasting row_scale to match x_block's shape
-    # x_block_r shape (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, INNER_BLOCK_SIZE)
-    # row_scale shape (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE,) -> (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, 1)
-    row_normalized_r = x_block_r / row_scale_r[:, None]
+    # Broadcasting scale to match x_block's shape
+    # x_block_r shape:
+    # (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, SCALE_BLOCK_SIZE)
+    # scale[:, None] shape:
+    # (ROW_TILE_SIZE * BLOCKS_PER_COL_TILE, 1)
+    scaled_data_r = x_block_r / scale_fp32_r[:, None]

     # Reshape back to original tile size
-    row_normalized = tl.reshape(row_normalized_r, ROW_TILE_SIZE, COL_TILE_SIZE)
-
-    # Quantize to float8
-    row_normalized = row_normalized.to(tl.float8e4nv)
+    e4m3_data_2d = tl.reshape(scaled_data_r, ROW_TILE_SIZE, COL_TILE_SIZE).to(
+        tl.float8e4nv
+    )

     # Store the row-normalized result in row-major format
-    tl.store(output_ptr + row_major_offsets, row_normalized, mask=mask)
-
-    # For rowwise quantization, scale tensor has shape (n_rows, n_cols // INNER_BLOCK_SIZE)
-    # Calculate base offset for this tile's scales
-    scales_per_row = n_cols // INNER_BLOCK_SIZE
+    tl.store(output_ptr + row_major_offsets, e4m3_data_2d, mask=mask)

-    # Create row and column indices for scale storage
-    scale_row_indices = tl.arange(0, ROW_TILE_SIZE)[:, None] + (
-        pid_row * ROW_TILE_SIZE
+    # Calculate scale offsets to write to
+    scales_per_row = n_cols // SCALE_BLOCK_SIZE
+    scale_row_indices = (
+        pid_row * ROW_TILE_SIZE + tl.arange(0, ROW_TILE_SIZE)[:, None]
     )
-    scale_col_indices = tl.arange(0, BLOCKS_PER_COL_TILE)[None, :] + (
-        pid_col * BLOCKS_PER_COL_TILE
+    scale_col_indices = (
+        pid_col * SCALE_BLOCKS_PER_COL_TILE
+        + tl.arange(0, SCALE_BLOCKS_PER_COL_TILE)[None, :]
     )
-
-    # Calculate linear indices into scale tensor
     scale_offsets = scale_row_indices * scales_per_row + scale_col_indices

-    # Create masks for valid scale indices
-    scale_row_mask = scale_row_indices < n_rows
-    scale_col_mask = scale_col_indices < scales_per_row
-    scale_mask = scale_row_mask & scale_col_mask
-
-    # Reshape scale values and masks to match the flattened layout
-    row_scale_e8m0_2d = row_scale_e8m0_r.reshape(ROW_TILE_SIZE, BLOCKS_PER_COL_TILE)
-
-    # Store the scales with proper masking
-    tl.store(row_scale_ptr + scale_offsets, row_scale_e8m0_2d, mask=scale_mask)
+    # Store e8m0 scales
+    scale_mask = (scale_row_indices < n_rows) & (scale_col_indices < scales_per_row)
+    scale_e8m0_2d = scale_e8m0_r.reshape(ROW_TILE_SIZE, SCALE_BLOCKS_PER_COL_TILE)
+    tl.store(scale_ptr + scale_offsets, scale_e8m0_2d, mask=scale_mask)

 @triton_op("torchao::triton_to_mxfp8_dim0", mutates_args={})
 def triton_to_mxfp8_dim0(
@@ -1155,7 +1136,7 @@ def triton_to_mxfp8_dim0(

     Output:
     * `output`: the `float8_e4m3fn` values of `x` cast to mxfp8 across dim0 (rowwise)
-    * `row_scale`: the `e8m0` values of `x_scale` used to cast `x` to mxfp8 across dim0
+    * `scale`: the `e8m0` values of `x_scale` used to cast `x` to mxfp8 across dim0
     """
     assert x.is_contiguous(), "`x` must be contiguous"
     assert inner_block_size <= 32
@@ -1175,7 +1156,7 @@ def triton_to_mxfp8_dim0(
     )

     # Create scale tensors for rowwise scaling
-    row_scale = torch.empty(
+    scale = torch.empty(
         (n_rows, n_cols // inner_block_size),
         dtype=torch.uint8,
         device=x.device,
@@ -1191,19 +1172,19 @@ def triton_to_mxfp8_dim0(
     wrap_triton(to_mxfp8_dim0_kernel)[grid](
         x_ptr=x,
         output_ptr=output,
-        row_scale_ptr=row_scale,
+        scale_ptr=scale,
         n_rows=n_rows,
         n_cols=n_cols,
-        INNER_BLOCK_SIZE=inner_block_size,
+        SCALE_BLOCK_SIZE=inner_block_size,
     )

     # Reshape output back to original shape
     output = output.reshape(x_orig_shape)
-    row_scale = row_scale.reshape(*x_orig_shape[:-1], row_scale.shape[-1])
+    scale = scale.reshape(*x_orig_shape[:-1], scale.shape[-1])

     return (
         output,
-        row_scale.view(torch.float8_e8m0fnu),
+        scale.view(torch.float8_e8m0fnu),
     )

 @triton_op("torchao::triton_to_mxfp8_dim1", mutates_args={})
