Commit 05abc90

Merge pull request #2706 from AI-Hypercomputer:mohit/tokamax_quant_gmm
PiperOrigin-RevId: 834605168
2 parents 7701bd2 + 67dc1f3 commit 05abc90

6 files changed: +219 -76 lines changed

src/MaxText/configs/base.yml

Lines changed: 23 additions & 5 deletions
@@ -128,7 +128,9 @@ save_quantized_params_path: ""
 model_call_mode: ""
 use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the model will be quantized using qwix.
 # Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
-quantization_calibration_method: "absmax"
+weight_quantization_calibration_method: "absmax"
+act_quantization_calibration_method: "absmax"
+bwd_quantization_calibration_method: "absmax"
 # Shard the range finding operation for quantization. By default this is set to number of slices.
 quantization_local_shard_count: -1
 
@@ -177,10 +179,26 @@ load_balance_loss_weight: 0.01 # weight for the load balance loss
 use_random_routing: False # whether to use random routing for debug/test purpose
 use_custom_sort_vjp: True # whether to use a custom sort vjp for sparse matmul ops
 use_ring_of_experts: False # whether to use ring of experts for sparse matmul expert parallelism
-# Tunable tiling dimensions used for Megablox
-tile_batch_seq: 512
-tile_embed_dim: 1024
-tile_mlp_dim: 1024
+# Tunable tiling dimensions used for MLP GMM, includes Tokamax ragged_dot and Megablox
+wi_tile_fwd_batch_seq: 512
+wi_tile_fwd_embed_dim: 1024
+wi_tile_fwd_mlp_dim: 1024
+wi_tile_dlhs_batch_seq: 512
+wi_tile_dlhs_embed_dim: 1024
+wi_tile_dlhs_mlp_dim: 1024
+wi_tile_drhs_batch_seq: 512
+wi_tile_drhs_embed_dim: 1024
+wi_tile_drhs_mlp_dim: 1024
+
+wo_tile_fwd_batch_seq: 512
+wo_tile_fwd_embed_dim: 1024
+wo_tile_fwd_mlp_dim: 1024
+wo_tile_dlhs_batch_seq: 512
+wo_tile_dlhs_embed_dim: 1024
+wo_tile_dlhs_mlp_dim: 1024
+wo_tile_drhs_batch_seq: 512
+wo_tile_drhs_embed_dim: 1024
+wo_tile_drhs_mlp_dim: 1024
 norm_topk_prob: False # Boolean to enable the top-k probability normalization. Qwen3-specific normalization of router weights.
 
 # How the expert axis is used to shard attention weights and activations
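
Each MoE projection (wi and wo) now gets its own tile sizes for the forward pass and for the two backward-pass matmuls (dlhs and drhs), replacing the single tile_batch_seq / tile_embed_dim / tile_mlp_dim triple. Below is a hedged Python sketch of how these knobs could be packed into the nine-element tiling tuple that ops.gmm now takes; the cfg object and the ordering inside each triple are illustrative assumptions, while the slice-to-pass mapping (tiling[:3] forward, tiling[3:6] dlhs, tiling[-3:] drhs) follows the ops.py change further down.

# Hypothetical packing of the new base.yml knobs for the wi projection.
def wi_tiling(cfg):
  return (
      cfg.wi_tile_fwd_batch_seq, cfg.wi_tile_fwd_embed_dim, cfg.wi_tile_fwd_mlp_dim,     # tiling[:3]  -> forward gmm
      cfg.wi_tile_dlhs_batch_seq, cfg.wi_tile_dlhs_embed_dim, cfg.wi_tile_dlhs_mlp_dim,  # tiling[3:6] -> dlhs gmm
      cfg.wi_tile_drhs_batch_seq, cfg.wi_tile_drhs_embed_dim, cfg.wi_tile_drhs_mlp_dim,  # tiling[-3:] -> drhs tgmm
  )

An analogous wo_tiling(cfg) would pack the wo_* values for the second MoE projection.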

src/MaxText/configs/types.py

Lines changed: 28 additions & 7 deletions
@@ -341,9 +341,17 @@ class Quantization(BaseModel):
   kv_quant_dtype: Literal["int8", "int4"] = Field("int8", description="Data type for KV cache quantization.")
   quantization_local_shard_count: int = Field(-1, description="Shards the range finding operation for quantization.")
   use_qwix_quantization: bool = Field(False, description="Whether to use qwix for quantization.")
-  quantization_calibration_method: str = Field(
+  weight_quantization_calibration_method: str = Field(
       "absmax",
-      description="Quantization calibration method used for weights and activations.",
+      description="Quantization calibration method used for weights.",
+  )
+  act_quantization_calibration_method: str = Field(
+      "absmax",
+      description="Quantization calibration method used for activations.",
+  )
+  bwd_quantization_calibration_method: str = Field(
+      "absmax",
+      description="Quantization calibration method used for gradients.",
   )
 
 
@@ -547,9 +555,24 @@ class MoEKernels(BaseModel):
 
   megablox: bool = Field(True, description="Whether to use Megablox kernels for MoE.")
   sparse_matmul: bool = Field(True, description="Whether to use sparse matmul kernels for MoE.")
-  tile_batch_seq: int = Field(512, description="Tunable tiling dimension for batch/sequence in Megablox.")
-  tile_embed_dim: int = Field(1024, description="Tunable tiling dimension for embedding in Megablox.")
-  tile_mlp_dim: int = Field(1024, description="Tunable tiling dimension for MLP in Megablox.")
+  wi_tile_fwd_batch_seq: int = Field(512, description="forward pass tiling dimension for batch/sequence in GMM for wi.")
+  wi_tile_fwd_embed_dim: int = Field(1024, description="forward pass tiling dimension for embedding in GMM for wi.")
+  wi_tile_fwd_mlp_dim: int = Field(1024, description="forward pass tiling dimension for MLP in GMM for wi.")
+  wi_tile_dlhs_batch_seq: int = Field(512, description="bwd pass dlhs tiling dimension for batch/sequence in GMM for wi.")
+  wi_tile_dlhs_embed_dim: int = Field(1024, description="bwd pass dlhs tiling dimension for embedding in GMM for wi.")
+  wi_tile_dlhs_mlp_dim: int = Field(1024, description="bwd pass dlhs tiling dimension for MLP in GMM for wi.")
+  wi_tile_drhs_batch_seq: int = Field(512, description="bwd pass drhs tiling dimension for batch/sequence in GMM for wi.")
+  wi_tile_drhs_embed_dim: int = Field(1024, description="bwd pass drhs tiling dimension for embedding in GMM for wi.")
+  wi_tile_drhs_mlp_dim: int = Field(1024, description="bwd pass drhs tiling dimension for MLP in GMM for wi.")
+  wo_tile_fwd_batch_seq: int = Field(512, description="forward pass tiling dimension for batch/sequence in GMM for wo.")
+  wo_tile_fwd_embed_dim: int = Field(1024, description="forward pass tiling dimension for embedding in GMM for wo.")
+  wo_tile_fwd_mlp_dim: int = Field(1024, description="forward pass tiling dimension for MLP in GMM for wo.")
+  wo_tile_dlhs_batch_seq: int = Field(512, description="bwd pass dlhs tiling dimension for batch/sequence in GMM for wo.")
+  wo_tile_dlhs_embed_dim: int = Field(1024, description="bwd pass dlhs tiling dimension for embedding in GMM for wo.")
+  wo_tile_dlhs_mlp_dim: int = Field(1024, description="bwd pass dlhs tiling dimension for MLP in GMM for wo.")
+  wo_tile_drhs_batch_seq: int = Field(512, description="bwd pass drhs tiling dimension for batch/sequence in GMM for wo.")
+  wo_tile_drhs_embed_dim: int = Field(1024, description="bwd pass drhs tiling dimension for embedding in GMM for wo.")
+  wo_tile_drhs_mlp_dim: int = Field(1024, description="bwd pass drhs tiling dimension for MLP in GMM for wo.")
 
 
 class DeepSeekMoE(BaseModel):
@@ -1400,7 +1423,6 @@ class DerivedValues(BaseModel):
       None, description="Increment for global batch size during rampup."
   )
   rampup_samples_per_increment_to_load: None | float = Field(None, description="Samples per increment for rampup.")
-  tile_fwd_batch_seq: None | int = Field(None, description="Legacy alias for tile_batch_seq.")
 
 
 # ----------------------------------------------------------------------------
@@ -1721,7 +1743,6 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
     if self.expert_shard_attention_option == "context":
       cp_size *= self.ici_expert_parallelism * self.dcn_expert_parallelism
     self.context_parallel_size = cp_size
-    self.tile_fwd_batch_seq = self.tile_batch_seq # Legacy alias.
     if self.pipeline_parallel_layers == -1:
       if self.decoder_block == DecoderBlockType.DEEPSEEK:
         moe_layers = self.num_decoder_layers - self.first_num_dense_layers
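
The single quantization_calibration_method field is split into three so that weights, activations, and gradients can be calibrated independently. A standalone pydantic sketch mirroring the new fields (illustration only, not the MaxText class; defaults match base.yml):

from pydantic import BaseModel, Field

# Minimal stand-in for the three per-tensor calibration fields.
class QuantizationCalibrationSketch(BaseModel):
  weight_quantization_calibration_method: str = Field("absmax", description="Calibration method for weights.")
  act_quantization_calibration_method: str = Field("absmax", description="Calibration method for activations.")
  bwd_quantization_calibration_method: str = Field("absmax", description="Calibration method for gradients.")

cfg = QuantizationCalibrationSketch()
print(cfg.model_dump())  # all three default to "absmax"

The strings are passed through to qwix here; ops.py only inspects whether the weight method starts with "fixed" when deciding to all-gather quantized expert weights.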

src/MaxText/kernels/megablox/backend.py

Lines changed: 13 additions & 0 deletions
@@ -20,6 +20,7 @@
 import dataclasses
 import functools
 from typing import Any, Optional
+import json
 
 import jax
 from jax import lax
@@ -514,6 +515,11 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   bytes_accessed = (lhs_bytes * tiles_n) + (rhs_bytes * max_active_tiles) + out_bytes
   flops = 2 * m * k * n
   cost_estimate = pl.CostEstimate(flops=flops, bytes_accessed=bytes_accessed, transcendentals=0)
+  metadata = {
+      "preferred_element_type": jnp.dtype(preferred_element_type).name,
+      "tiling": {"tile_m": tm, "tile_k": tk, "tile_n": tn},
+      "transpose_rhs": transpose_rhs,
+  }
   call_gmm = qpl.pallas_call(
       kernel,
       out_shape=jax.ShapeDtypeStruct((m, n), preferred_element_type),
@@ -532,6 +538,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
       compiler_params=pltpu.CompilerParams(dimension_semantics=("parallel", "arbitrary", "arbitrary")),
       interpret=interpret,
       cost_estimate=cost_estimate,
+      metadata={"xprof_metadata": json.dumps(metadata)},
   )
 
   out = call_gmm(
@@ -761,6 +768,11 @@ def out_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
   flops = 2 * m * k * n
   cost_estimate = pl.CostEstimate(flops=flops, bytes_accessed=bytes_accessed, transcendentals=0)
   lhs = lhs.swapaxes(0, 1)
+  metadata = {
+      "tiling": {"tile_m": tm, "tile_k": tk, "tile_n": tn},
+      "prefer_element_type": jnp.dtype(preferred_element_type).name,
+      "num_actual_groups": num_actual_groups,
+  }
   call_gmm = qpl.pallas_call(
       kernel,
       out_shape=jax.ShapeDtypeStruct((num_actual_groups, k, n), preferred_element_type),
@@ -779,6 +791,7 @@ def out_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
       compiler_params=pltpu.CompilerParams(dimension_semantics=("parallel", "arbitrary", "arbitrary")),
       interpret=interpret,
       cost_estimate=cost_estimate,
+      metadata={"xprof_metadata": json.dumps(metadata)},
   )
 
   out = call_gmm(
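
This change is purely observability: each GMM/TGMM pallas_call now carries an xprof_metadata JSON string so the chosen tile sizes and dtypes show up in profiler traces. A small standalone sketch of the payload construction; the helper name is hypothetical, and the kernels build the dict inline as in the diff above.

import json

import jax.numpy as jnp

def gmm_xprof_metadata(tm, tk, tn, preferred_element_type=jnp.float32, transpose_rhs=False):
  # Mirrors the dict passed as metadata={"xprof_metadata": json.dumps(...)}.
  payload = {
      "preferred_element_type": jnp.dtype(preferred_element_type).name,
      "tiling": {"tile_m": tm, "tile_k": tk, "tile_n": tn},
      "transpose_rhs": transpose_rhs,
  }
  return {"xprof_metadata": json.dumps(payload)}

print(gmm_xprof_metadata(512, 1024, 1024))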

src/MaxText/kernels/megablox/ops.py

Lines changed: 93 additions & 37 deletions
@@ -17,10 +17,12 @@
 # pylint: disable=too-many-positional-arguments
 
 import functools
+import dataclasses
 from typing import Literal
 import jax
 import jax.numpy as jnp
 from MaxText.kernels.megablox import backend
+from tokamax._src.ops.ragged_dot import pallas_mosaic_tpu_kernel as tokamax_backend
 import qwix
 import qwix.pallas as qpl
 
@@ -30,14 +32,16 @@ def gmm(
     rhs: jnp.ndarray,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
-    tiling: tuple[int, int, int] = (128, 128, 128),
+    tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
     group_offset: jnp.ndarray | None = None,
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
     lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     use_qwix_quantization: bool = False,
+    use_tokamax_backend: bool = False,
+    is_fsdp_shard_on_exp: bool = False,
 ):
   """Grouped matrix multiplication operation."""
   quantization_rule = None
@@ -57,7 +61,7 @@
     )
 
   gmm_fwd_bwd = lambda *args: _gmm_fwd(*args)[0] # pylint: disable=C3001
-  gmm_fwd_bwd = jax.custom_vjp(gmm_fwd_bwd, nondiff_argnums=(3, 4, 7, 8, 9))
+  gmm_fwd_bwd = jax.custom_vjp(gmm_fwd_bwd, nondiff_argnums=(3, 4, 7, 8, 9, 10, 11))
   gmm_fwd_bwd.defvjp(_gmm_fwd, functools.partial(_gmm_bwd, lhs.dtype, rhs.dtype))
   return gmm_fwd_bwd(
       lhs,
@@ -70,6 +74,8 @@
       transpose_rhs,
       interpret,
       quantization_rule,
+      use_tokamax_backend,
+      is_fsdp_shard_on_exp,
   )
 
 
@@ -78,12 +84,14 @@ def _gmm_fwd(
     rhs: jnp.ndarray,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
-    tiling: tuple[int, int, int] = (128, 128, 128),
+    tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
     group_offset: jnp.ndarray | None = None,
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
     quantization_rule: qwix.QtRule | None = None,
+    use_tokamax_backend: bool = False,
+    is_fsdp_shard_on_exp: bool = False,
 ) -> tuple[
     jnp.ndarray,
     tuple[
@@ -114,29 +122,52 @@ def _gmm_fwd(
         calibration_method=quantization_rule.weight_calibration_method,
         scale_dtype=jnp.float32,
     )
-
-  out = backend.gmm(
-      lhs,
-      rhs,
-      group_sizes,
-      preferred_element_type,
-      tiling,
-      group_offset,
-      existing_out,
-      transpose_rhs=transpose_rhs,
-      interpret=interpret,
-  )
+  # QAG is only supported for following conditions
+  if use_tokamax_backend:
+    if quantization_rule and quantization_rule.bwd_qtype:
+      if (
+          quantization_rule.weight_calibration_method.startswith("fixed")
+          and isinstance(rhs, qpl.QArray)
+          and is_fsdp_shard_on_exp
+      ):
+        rhs_qvalue = jax.lax.all_gather(rhs.qvalue, "fsdp", axis=0, tiled=True)
+        rhs = dataclasses.replace(rhs, qvalue=rhs_qvalue)
+    out = tokamax_backend.gmm(
+        lhs=lhs,
+        rhs=rhs,
+        group_sizes=group_sizes,
+        precision=jax.lax.Precision.DEFAULT,
+        out_dtype=preferred_element_type,
+        tiling=tiling[:3],
+        group_offset=group_offset,
+        transpose_rhs=transpose_rhs,
+        interpret=interpret,
+    )
+  else:
+    out = backend.gmm(
+        lhs,
+        rhs,
+        group_sizes,
+        preferred_element_type,
+        tiling[:3],
+        group_offset,
+        existing_out,
+        transpose_rhs=transpose_rhs,
+        interpret=interpret,
+    )
   return out, (lhs, rhs, group_sizes, group_offset)
 
 
 def _gmm_bwd(
     lhs_dtype: jax.typing.DTypeLike,
     rhs_dtype: jax.typing.DTypeLike,
     preferred_element_type: jnp.dtype,
-    tiling: tuple[int, int, int],
+    tiling: tuple[int, int, int, int, int, int, int, int, int],
     transpose_rhs: bool,
     interpret: bool,
     quantization_rule: qwix.QtRule | None,
+    use_tokamax_backend: bool,
+    is_fsdp_shard_on_exp: bool,
     residual: tuple[
         jnp.ndarray | qpl.QArray,
         jnp.ndarray | qpl.QArray,
@@ -187,27 +218,52 @@ def _gmm_bwd(
         calibration_method=quantization_rule.bwd_calibration_method,
         scale_dtype=jnp.float32,
     )
-
-  dlhs = backend.gmm(
-      dlhs_dout,
-      rhs,
-      group_sizes,
-      lhs_dtype,
-      tiling,
-      group_offset,
-      transpose_rhs=not transpose_rhs,
-      interpret=interpret,
-  )
-  drhs = backend.tgmm(
-      lhs.swapaxes(0, 1),
-      drhs_dout,
-      group_sizes,
-      rhs_dtype,
-      tiling,
-      group_offset,
-      num_actual_groups,
-      interpret=interpret,
-  )
+  if use_tokamax_backend:
+    dlhs = tokamax_backend.gmm(
+        lhs=dlhs_dout,
+        rhs=rhs,
+        group_sizes=group_sizes,
+        precision=jax.lax.Precision.DEFAULT,
+        out_dtype=lhs_dtype,
+        tiling=tiling[3:6],
+        group_offset=group_offset,
+        transpose_rhs=not transpose_rhs,
+        interpret=interpret,
+    )
+    drhs = tokamax_backend.tgmm(
+        lhs=lhs.swapaxes(0, 1),
+        rhs=drhs_dout,
+        group_sizes=group_sizes,
+        precision=jax.lax.Precision.DEFAULT,
+        out_dtype=rhs_dtype,
+        tiling=tiling[-3:],
+        group_offset=group_offset,
+        num_actual_groups=num_actual_groups,
+        interpret=interpret,
+    )
+    if quantization_rule and quantization_rule.bwd_qtype and is_fsdp_shard_on_exp:
+      drhs = jax.lax.psum_scatter(drhs, "fsdp", scatter_dimension=0, tiled=True)
+  else:
+    dlhs = backend.gmm(
+        dlhs_dout,
+        rhs,
+        group_sizes,
+        lhs_dtype,
+        tiling[3:6],
+        group_offset,
+        transpose_rhs=not transpose_rhs,
+        interpret=interpret,
+    )
+    drhs = backend.tgmm(
+        lhs.swapaxes(0, 1),
+        drhs_dout,
+        group_sizes,
+        rhs_dtype,
+        tiling[-3:],
+        group_offset,
+        num_actual_groups,
+        interpret=interpret,
+    )
 
   # NOTE: If the rhs transposition is fused into the forward pass we need to
   # return the transpose of the rhs gradient that we calculated above.
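
Taken together, a caller now selects the backend per call and supplies one tile triple per pass: tiling[:3] feeds the forward GMM, tiling[3:6] the dlhs GMM, and tiling[-3:] the drhs TGMM. A hedged usage sketch of the new signature follows; shapes, dtypes, and flag values are illustrative, and running the kernels requires a TPU-capable setup.

import jax.numpy as jnp

from MaxText.kernels.megablox import ops

# Illustrative shapes: 8 experts, 1024 tokens, 2048 embed dim, 8192 mlp dim.
lhs = jnp.zeros((1024, 2048), dtype=jnp.bfloat16)
rhs = jnp.zeros((8, 2048, 8192), dtype=jnp.bfloat16)
group_sizes = jnp.full((8,), 128, dtype=jnp.int32)  # sums to the 1024 tokens

out = ops.gmm(
    lhs,
    rhs,
    group_sizes,
    preferred_element_type=jnp.bfloat16,
    # (fwd_m, fwd_k, fwd_n, dlhs_m, dlhs_k, dlhs_n, drhs_m, drhs_k, drhs_n)
    tiling=(512, 1024, 1024, 512, 1024, 1024, 512, 1024, 1024),
    use_tokamax_backend=True,    # route fwd and bwd through the Tokamax ragged_dot kernels
    is_fsdp_shard_on_exp=False,  # set True only when the expert dimension is FSDP-sharded
)

When the QAG path applies (Tokamax backend, a bwd quantization dtype, fixed-range weight calibration, and FSDP-sharded experts), the forward pass all-gathers the quantized rhs over the "fsdp" axis and the backward pass psum-scatters drhs back across it.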
