From ad6bf2ae16ed7d7c54ecf725c649d19e6eab5017 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 24 Oct 2024 22:38:05 +0000 Subject: [PATCH 01/32] added XLA custom op defs for TE GEMM Signed-off-by: Alp Dener Added XLA FFI custom op for TE GEMM Signed-off-by: Alp Dener finished GEMM custom op primitive and serial unit test Signed-off-by: Alp Dener fixed GEMM custom op batcher Signed-off-by: Alp Dener fixed output dtype error and contracting dimensions options Signed-off-by: Alp Dener AG overlap working but executes scatter to match outer LHS dim Signed-off-by: Alp Dener both all-gather and all-reduce are now working Signed-off-by: Alp Dener code style Signed-off-by: Alp Dener changed kwargs in abstract to be explicit Signed-off-by: Alp Dener added fwd/bwd implementation for non-fp8 gemm Signed-off-by: Alp Dener --- tests/jax/test_custom_call_compute.py | 55 ++ .../jax/cpp_extensions/__init__.py | 1 + transformer_engine/jax/cpp_extensions/gemm.py | 647 ++++++++++++++++++ transformer_engine/jax/cpp_extensions/misc.py | 7 + transformer_engine/jax/csrc/extensions.h | 39 ++ .../jax/csrc/extensions/gemm.cpp | 170 +++++ .../jax/csrc/extensions/packing.cpp | 11 + .../jax/csrc/extensions/pybind.cpp | 5 +- transformer_engine/jax/csrc/utils.h | 2 +- transformer_engine/jax/flax/module.py | 7 +- transformer_engine/jax/fp8.py | 7 +- transformer_engine/jax/gemm.py | 425 ++++++++++++ 12 files changed, 1370 insertions(+), 6 deletions(-) create mode 100644 transformer_engine/jax/cpp_extensions/gemm.py create mode 100644 transformer_engine/jax/csrc/extensions/gemm.cpp create mode 100644 transformer_engine/jax/gemm.py diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 20b16c2809..9bf3f9fa91 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -25,6 +25,7 @@ _jax_dbias_cast_transpose, ) from transformer_engine.jax.cpp_extensions.quantization import _jax_cast_fp8 +from transformer_engine.jax.gemm import fp8_gemm, gemm from transformer_engine.jax import cpp_extensions as tex @@ -415,6 +416,60 @@ def ref_func(x, ln_s, y, z, w, v, amax_list_1, amax_list_2, scale_list_1, scale_ ) +class TestGemm: + + @staticmethod + def _generate_inputs(b, m, n, k, dtype): + key = jax.random.PRNGKey(0) + subkeys = jax.random.split(key, 3) + a = jax.random.normal(subkeys[0], (b, m, k), dtype) + b = jax.random.normal(subkeys[1], (n, k), dtype) + bias_dtype = dtype if dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2] else jnp.bfloat16 + bias = jax.random.normal(subkeys[2], (n, ), bias_dtype) + return a, b, bias + + @staticmethod + def _generate_fp8_inputs(b, m, n, k, fp8_dtype): + a, b, bias = TestGemm._generate_inputs(b, m, n, k, jnp.bfloat16) + a_scale, b_scale = map( + lambda x: (jnp.max(jnp.abs(x)) / 127.).astype(jnp.float32), + [a, b] + ) + a_q, b_q = map( + lambda x, x_scale: jnp.round(x / x_scale).astype(fp8_dtype), + [(a, a_scale), (b, b_scale)] + ) + return a, a_q, jnp.reciprocal(a_scale), b, b_q, jnp.reciprocal(b_scale), bias + + @pytest.mark.parametrize("m,n,k", GEMM_CASES) + @pytest.mark.parametrize("use_bias", (False, True)) + @pytest.mark.parametrize("do_gelu", (False, True)) + def test_gemm(self, b, m, n, k, use_bias, do_gelu): + a, b, bias = self._generate_inputs(b, m, n, k, jnp.bfloat16) + + primitive_out = gemm(a, b, bias=bias if use_bias else None, layout='NT', do_gelu=do_gelu) + ref_out = jnp.dot(a, b) + if use_bias: + ref_out += bias + if do_gelu: + ref_out = jax.nn.gelu(ref_out) + + assert_allclose(primitive_out, ref_out, 
dtype=jnp.bfloat16) + + @pytest.mark.skipif(not is_fp8_supported, reason=reason) + @pytest.mark.parametrize("m,n,k", GEMM_CASES) + @pytest.mark.parametrize("fp8_dtype", FP8_COMPUTE_TYPE) + def test_fp8_gemm(self, m, n, k, fp8_dtype): + a, a_q, a_scale_inv, b, b_q, b_scale_inv, _ = self._generate_fp8_inputs( + m, n, k, fp8_dtype + ) + + primitive_out = fp8_gemm(a_q, a_scale_inv, b_q, b_scale_inv, out_dtype=jnp.bfloat16) + ref_out = jnp.dot(a, b) + + assert_allclose(primitive_out, ref_out, dtype=fp8_dtype) + + @pytest.fixture(name="random_inputs") def random_inputs_fixture(shape): key = jax.random.PRNGKey(0) diff --git a/transformer_engine/jax/cpp_extensions/__init__.py b/transformer_engine/jax/cpp_extensions/__init__.py index 579daa8e41..1e5cc4c07e 100644 --- a/transformer_engine/jax/cpp_extensions/__init__.py +++ b/transformer_engine/jax/cpp_extensions/__init__.py @@ -4,6 +4,7 @@ """Python interface for c++ extensions""" from .activation import * from .attention import * +from .gemm import * from .normalization import * from .quantization import * from .softmax import * diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py new file mode 100644 index 0000000000..677fabca59 --- /dev/null +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -0,0 +1,647 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""JAX/TE custom ops for cuBlasLt GEMM""" +import warnings +import operator +from functools import reduce +from typing import Optional, Union, Tuple + +import jax +import jax.numpy as jnp +from jax import dtypes +from jax.interpreters import mlir +from jax.interpreters.mlir import ir +from jax.sharding import PartitionSpec, NamedSharding +from jax.extend import ffi +from jax.typing import ArrayLike + +from transformer_engine import transformer_engine_jax as tex +from .base import BasePrimitive, register_primitive +from .custom_call import custom_caller, CustomCallArgsWrapper +from .misc import ( + jax_dtype_to_te_dtype, + jax_dtype_is_fp8, + get_padded_spec, + is_ffi_enabled, +) +from ..sharding import ( + global_mesh_resource, + get_mesh_axis_size, + lax_paral_op, + all_reduce_max_along_all_axes_except_PP, +) + + +__all__ = [ + "fp8_gemm_impl", + "gemm_impl", +] + + +def get_cublas_workspace_size_bytes() -> None: + """Return 32 MiB if using hopper, 4 MiB for all other architectures.""" + if tex.get_device_compute_capability() >= 90: + return 33_554_432 + return 4_194_304 + + +class CollectiveGemmPrimitive(BasePrimitive): + """ + cuBlasLt GEMM Primitive w/ support for distributed inputs + """ + + name = "te_gemm" + impl_static_args = (8, 9, 10, 11, 12, 13, 14) + multiple_results = True + inner_primitive = None + outer_primitive = None + + @staticmethod + def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_aval, + gelu_input_aval, out_amax_aval, out_scale_aval, out_dtype, contracting_dims, + fuse_gelu, fuse_bias, grad, accumulate, use_split_accumulator): + """ + cuBlasLt GEMM abstract + """ + del grad, accumulate, use_split_accumulator + + # Validate operand dtypes + lhs_dtype = dtypes.canonicalize_dtype(lhs_aval.dtype) + rhs_dtype = dtypes.canonicalize_dtype(rhs_aval.dtype) + assert lhs_dtype == rhs_dtype, "Mismatched matrix dtypes for GEMM." 
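+        # FP8 operands must be accompanied by a single-element float32 scale-inverse tensor;
+        # the checks below confirm that each FP8 operand carries a valid scale inverse.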
+ is_fp8 = False + if jax_dtype_is_fp8(lhs_dtype): + assert ( + lhs_scale_inv_aval.size == 1 + and dtypes.canonicalize_dtype(lhs_scale_inv_aval.dtype) == jnp.float32 + ), "Missing LHS operand scale inverse in FP8 GEMM." + is_fp8 = True + if jax_dtype_is_fp8(rhs_dtype): + assert ( + rhs_scale_inv_aval.size == 1 + and dtypes.canonicalize_dtype(rhs_scale_inv_aval.dtype) == jnp.float32 + ), "Missing RHS operand scale inverse in FP8 GEMM." + + # Disallow batching for RHS + assert rhs_aval.ndim == 2, "GEMM does not support batching the RHS operand." + + # Validate operand layouts + lhs_inner_dim, rhs_inner_dim = map( + lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, + contracting_dims, + (lhs_aval.ndim, rhs_aval.ndim) + ) + assert ( + lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim] + ), f"Incompatible operand sizes: {lhs_aval.shape} x {rhs_aval.shape}." + + lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 + rhs_trans = rhs_inner_dim == 1 + assert ( + not (lhs_trans and rhs_trans) + ), "GEMM does not support transposed LHS and transposed RHS at the same time." + if is_fp8: + assert lhs_trans, "FP8 GEMM does not support transposed LHS." + assert rhs_trans, "FP8 GEMM requires transposed RHS." + + # Validate output dtype + if jax_dtype_is_fp8(out_dtype): + assert ( + jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8(rhs_dtype) + ), "FP8 GEMM output requires FP8 inputs." + assert ( + out_amax_aval.size == out_scale_aval.size == 1 + ), "Invalid/missing output amax and scale." + out_amax_updated_dtype = dtypes.canonicalize_dtype(out_amax_aval.dtype) + out_scale_updated_dtype = dtypes.canonicalize_dtype(out_scale_aval.dtype) + assert ( + out_amax_updated_dtype == out_scale_updated_dtype == jnp.float32 + ), "Invalid output amax or scale dtype." + else: + out_dtype = lhs_dtype + out_amax_updated_dtype = jnp.float32 + out_scale_updated_dtype = jnp.float32 + + # Infer output shape + rhs_outer_dim = 0 if rhs_trans else 1 + lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 + lhs_bdims = [dim for dim in range(lhs_aval.ndim) + if dim not in [lhs_outer_dim, lhs_inner_dim]] + lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] + out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) + + # Validate bias/bias_grad shape against inferred output + bias_dtype = jnp.bfloat16 if jax_dtype_is_fp8(out_dtype) else out_dtype + if fuse_bias: + assert ( + bias_aval.size > 0 + and bias_aval.ndim == 1 + and bias_aval.shape[0] == out_shape[-1] + ), "Incorrect bias shape." + bias_dtype = dtypes.canonicalize_dtype(bias_aval.dtype) + else: + assert bias_aval.size == 0, "Internal TE error." + + # Validate GELU input/output + if fuse_gelu: + assert ( + all([gelu_input_aval.shape[i] == out_shape[i] for i in len(out_shape)]) + ), "Invalid GELU input shape." + assert gelu_input_aval.dtype == bias_dtype, "Invalid GELU dtype." + else: + assert gelu_input_aval.size == 0, "Internal TE error." 
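+        # The amax/scale/pre-GELU/bias outputs created below alias their corresponding input
+        # buffers in the lowering rule, so their shapes and dtypes must mirror the inputs.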
+ + # Create abstract arrays for all outputs + out_aval = lhs_aval.update(shape=out_shape, dtype=out_dtype) + out_amax_updated_aval = out_amax_aval.update(shape=out_amax_aval.shape, + dtype=out_amax_updated_dtype) + out_scale_updated_aval = out_scale_aval.update(shape=out_scale_aval.shape, + dtype=out_scale_updated_dtype) + pre_gelu_out_aval = gelu_input_aval.update(shape=gelu_input_aval.shape, dtype=bias_dtype) + bias_grad_aval = bias_aval.update(shape=bias_aval.shape, dtype=bias_dtype) + workspace_aval = jax.core.ShapedArray(shape=(get_cublas_workspace_size_bytes(), ), + dtype=jnp.uint8) + + return ( + out_aval, + out_amax_updated_aval, + out_scale_updated_aval, + pre_gelu_out_aval, + bias_grad_aval, + workspace_aval + ) + + @staticmethod + def outer_abstract(*args, **kwargs): + """ + cuBlasLt GEMM outer abstract + """ + ( + out_aval, + out_amax_aval, + out_scale_aval, + pre_gelu_out_aval, + bias_grad_aval, + _ + ) = CollectiveGemmPrimitive.abstract(*args, **kwargs) + return out_aval, out_amax_aval, out_scale_aval, pre_gelu_out_aval, bias_grad_aval + + @staticmethod + def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale, + *, out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, + use_split_accumulator): + """ + Fused attention fwd lowering rules + """ + lhs_aval, _, rhs_aval, _, bias_aval, *_ = ctx.avals_in + lhs_inner_dim, rhs_inner_dim = map( + lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, + contracting_dims, + (lhs_aval.ndim, rhs_aval.ndim) + ) + lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 + rhs_trans = rhs_inner_dim == 1 + + operand_output_aliases = { + 4: 4, # bias <--> bias_grad + 5: 3, # gelu_input <--> pre_gelu_out + 6: 1, # out_amax <--> out_amax_updated + 7: 2, # out_scale <--> out_scale_updated + } + + if is_ffi_enabled(): + name = "te_gemm_ffi" + return ffi.ffi_lowering(name, operand_output_aliases=operand_output_aliases)( + ctx, + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + lhs_trans=lhs_trans, + rhs_trans=rhs_trans, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator + ) + else: + operands = [ + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + ] + operand_shapes = map(lambda x: ir.RankedTensorType(x.type).shape, operands) + out_types = [ + ir.RankedTensorType.get(output.shape, mlir.dtype_to_ir_dtype(output.dtype)) + for output in ctx.avals_out + ] + args = CustomCallArgsWrapper(out_types, operands, operand_shapes) + + rhs_outer_dim = 0 if rhs_trans else 1 + lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 + lhs_bdims = [dim for dim in range(lhs_aval.ndim) + if dim not in [lhs_outer_dim, lhs_inner_dim]] + lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] + m = reduce(operator.mul, lhs_batch_shape, 1) * lhs_aval.shape[lhs_outer_dim] + k = rhs_aval.shape[rhs_inner_dim] + n = rhs_aval.shape[rhs_outer_dim] + workspace_size = get_cublas_workspace_size_bytes() + operand_dtype = jax_dtype_to_te_dtype(lhs_aval.dtype) + bias_dtype = jax_dtype_to_te_dtype(bias_aval.dtype) + opaque = tex.pack_gemm_descriptor(m, n, k, workspace_size, operand_dtype, + jax_dtype_to_te_dtype(out_dtype), bias_dtype, + lhs_trans, rhs_trans, fuse_gelu, fuse_bias, grad, + accumulate, use_split_accumulator) + + return custom_caller( + CollectiveGemmPrimitive.name, + args, + opaque, + has_side_effect=False, + 
operand_output_aliases=operand_output_aliases, + ) + + @staticmethod + def impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale, + out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, + use_split_accumulator): + assert CollectiveGemmPrimitive.inner_primitive is not None + + ( + out, + out_amax_updated, + out_scale_updated, + pre_gelu_out, + bias_grad, + _, + ) = CollectiveGemmPrimitive.inner_primitive.bind( + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + out_dtype=out_dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad + + @staticmethod + def batcher(batched_args, batch_dims, *, out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, + accumulate, use_split_accumulator): + assert CollectiveGemmPrimitive.outer_primitive is not None + + lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale = batched_args + assert rhs.ndim == 2, "TE/JAX GEMM custom op does not support batching RHS operands." + + # Get contracting and batch dimensions out + lhs_inner_dim, rhs_inner_dim = map( + lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, + contracting_dims, + (lhs.ndim, rhs.ndim) + ) + lhs_trans = lhs_inner_dim != lhs.ndim - 1 + rhs_trans = rhs_inner_dim == 1 + lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 + rhs_outer_dim = 0 if rhs_trans else 1 + lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] + + # FP8 GEMM only supports lhs_trans = False and rhs_trans = True so we may need to + # reorder the axes here to match + if jax_dtype_is_fp8(lhs.dtype): + lhs = jnp.transpose(lhs, (*lhs_bdims, lhs_outer_dim, lhs_inner_dim)) + lhs_trans = False + rhs = jnp.transpose(rhs, (rhs_outer_dim, rhs_inner_dim)) + rhs_trans = True + contracting_dims = (1, 1) + + # Collapse all non-contracting dimensions + batch_shape = [lhs.shape[dim] for dim in lhs_bdims] + batch_size = reduce(operator.mul, batch_shape, 1) + lhs_outer_size = lhs.shape[lhs_outer_dim] + lhs_shape_2d = ( + (lhs.shape[lhs_inner_dim], batch_size * lhs_outer_size) + if lhs_trans + else (batch_size * lhs_outer_size, lhs.shape[lhs_inner_dim]) + ) + lhs = jnp.reshape(lhs, lhs_shape_2d) + if fuse_gelu: + gelu_input = jnp.reshape( + gelu_input, (batch_size * lhs_outer_size, rhs.shape[rhs_outer_dim]) + ) + + outputs = CollectiveGemmPrimitive.outer_primitive.bind( + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + out_dtype=out_dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + # Reshape output to recover original LHS batch shape + outputs[0] = jnp.reshape( + outputs[0], + (*batch_shape, lhs_outer_size, rhs.shape[rhs_outer_dim]) + ) + gelu_bdims = batch_dims[3] + if fuse_gelu: + outputs[3] = jnp.reshape(outputs[3], outputs[0].shape) + gelu_bdims = lhs_bdims + + return ( + outputs, + (lhs_bdims, batch_dims[1], batch_dims[2], gelu_bdims, batch_dims[4]) + ) + + @staticmethod + def infer_sharding_from_operands(out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, + accumulate, use_split_accumulator, mesh, arg_infos, + result_infos): + del out_dtype, accumulate, use_split_accumulator, result_infos 
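+        # Output sharding is inferred from the LHS batch/leading specs and the RHS outer spec;
+        # any mismatch on the contracting dimension is resolved in partition() below.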
+ lhs, _, rhs, *_ = arg_infos + lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) + + lhs_inner_dim, rhs_inner_dim = map( + lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, + contracting_dims, + (lhs.ndim, rhs.ndim) + ) + if lhs_spec[lhs_inner_dim] != rhs_spec[rhs_inner_dim] and not grad: + warnings.warn("Forcing the inner dimension of LHS to match the sharding of inner " + + "dimension of RHS. This can trigger additional communication if LHS is " + + "not already partitioned correctly.") + + lhs_trans = lhs_inner_dim != lhs.ndim - 1 + rhs_trans = rhs_inner_dim == 1 + lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 + rhs_outer_dim = 0 if rhs_trans else 1 + lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] + batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] + rhs_outer_spec = rhs_spec[rhs_outer_dim] + + if rhs_spec[rhs_inner_dim] is not None and rhs_outer_spec is not None: + raise RuntimeError("Both inner and outer dimensions of RHS cannot be sharded.") + + # Outer (sequence) dimension of the GEMM output is always unsharded + out_spec = [*batch_specs, None, rhs_outer_spec] + out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) + + # FP8 metas are always unsharded + fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) + + # Pre-GELU output matches output spec if GELU fusion is turned on, otherwise unsharded + gelu_spec = out_spec if fuse_gelu else [None] + gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) + + # Bias gradient spec matches outer dimension of output if bias fusion is turned on + bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) + + return (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, bias_sharding) + + @staticmethod + def partition(out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, + use_split_accumulator, mesh, arg_infos, result_infos): + del result_infos + lhs, _, rhs, *_ = arg_infos + lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) + + lhs_inner_dim, rhs_inner_dim = map( + lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, + contracting_dims, + (lhs.ndim, rhs.ndim) + ) + + lhs_trans = lhs_inner_dim != lhs.ndim - 1 + rhs_trans = rhs_inner_dim == 1 + lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 + rhs_outer_dim = 0 if rhs_trans else 1 + lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] + batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] + rhs_outer_spec = rhs_spec[rhs_outer_dim] + + # Force all-gather the outer (sequence) dimension of the LHS operand + lhs_spec_new = [spec for spec in lhs_spec] + lhs_spec_new[lhs_outer_dim] = None + lhs_spec_new[lhs_inner_dim] = rhs_spec[rhs_inner_dim] + lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) + + # RHS operand is unchanged, we already enforce that only one dimension can be sharded + rhs_sharding = NamedSharding(mesh, PartitionSpec(*rhs_spec)) + + # Bias is sharded to match outer dimension spec of the RHS operand (also the output) + bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) + + # FP8 metas are always unsharded + fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) + + # Outer (sequence) dimension of the GEMM output is always unsharded + out_spec = [*batch_specs, None, rhs_outer_spec] + out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) + + # Pre-GELU output matches output 
spec if GELU fusion is turned on, otherwise unsharded + gelu_spec = out_spec if fuse_gelu else [None] + gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) + + arg_shardings = (lhs_sharding, fp8_meta_sharding, rhs_sharding, fp8_meta_sharding, + bias_sharding, gelu_sharding, fp8_meta_sharding, fp8_meta_sharding) + out_shardings = (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, + bias_sharding) + + def sharded_impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, + out_scale): + ( + out, + out_amax_updated, + out_scale_updated, + pre_gelu_out, + bias_grad, + ) = CollectiveGemmPrimitive.impl( + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + out_dtype=out_dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + # FP8 amax reduction + if jax_dtype_is_fp8(lhs.dtype): + out_amax_updated = all_reduce_max_along_all_axes_except_PP(out_amax_updated, mesh) + + if rhs_spec[rhs_inner_dim] is not None: + # GEMM output needs to be all-reduced when the contracting dimension is sharded. + # If the layer is sequence-parallel, we also need to scatter the output, which we + # can combine into a reduce-scatter here. + out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().cp_resource, + mesh) + if fuse_gelu: + pre_gelu_out = lax_paral_op( + pre_gelu_out, jax.lax.psum, global_mesh_resource().cp_resource, mesh + ) + + return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad + + return mesh, sharded_impl, out_shardings, arg_shardings + + +register_primitive(CollectiveGemmPrimitive) + + +def fp8_gemm_impl( + lhs: ArrayLike, + lhs_scale_inv: ArrayLike, + rhs: ArrayLike, + rhs_scale_inv: ArrayLike, + bias: Optional[ArrayLike] = None, + gelu_input: Optional[ArrayLike] = None, + out_amax: Optional[ArrayLike] = None, + out_scale: Optional[ArrayLike] = None, + out_dtype: jnp.dtype = jnp.bfloat16, + contracting_dims: Tuple[int, int] = (1, 1), + fuse_gelu: bool = False, + fuse_bias: bool = False, + accumulate: bool = False, + use_split_accumulator: bool = False, +) -> Tuple[ArrayLike, ...]: + """FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" + if out_dtype is not None and jax_dtype_is_fp8(out_dtype): + assert out_amax is not None and out_scale is not None, "Missing output amax and scale." + else: + out_amax = jnp.zeros(0, dtype=jnp.float32) + out_scale = jnp.zeros(0, dtype=jnp.float32) + + if not fuse_bias: + bias = jnp.zeros(0, dtype=jnp.bfloat16) + else: + assert ( + bias is not None + ), "Missing bias in forward GEMM when bias epilogue is enabled." 
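+    # If GELU fusion is requested without a caller-provided buffer, allocate a zero buffer
+    # shaped like the GEMM output so the custom op has somewhere to write the pre-GELU result.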
+ + if not fuse_gelu: + gelu_input = jnp.zeros(0, dtype=bias.dtype) + elif gelu_input is None: + lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 + rhs_outer_dim = 1 if contracting_dims[1] == 0 else 0 + out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + gelu_input = jnp.zeros(out_shape, dtype=bias.dtype) + + out, out_amax, out_scale, pre_gelu_out, _ = CollectiveGemmPrimitive.outer_primitive.bind( + rhs, + rhs_scale_inv, + lhs, + lhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + out_dtype=out_dtype, + contracting_dims=tuple(reversed(contracting_dims)), + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=False, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + return out, out_amax, out_scale, pre_gelu_out + + +def gemm_impl( + lhs: ArrayLike, + rhs: ArrayLike, + bias: Optional[ArrayLike] = None, + gelu_input: Optional[ArrayLike] = None, + contracting_dims: Tuple[int, int] = (1, 0), + fuse_gelu: bool = False, + fuse_bias: bool = False, + grad: bool = False, + accumulate: bool = False, + use_split_accumulator: bool = False, +) -> Tuple[ArrayLike, ...]: + """Non-FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" + dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) + + lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 + rhs_outer_dim = 1 if contracting_dims[1] == 0 else 0 + out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + + if not fuse_bias: + bias = jnp.zeros(0, dtype=lhs.dtype) + elif grad: + bias = jnp.zeros(out_shape[-1], dtype=lhs.dtype) + else: + assert ( + bias is not None + ), "Missing bias in forward GEMM when bias epilogue is enabled." + + if not fuse_gelu: + gelu_input = jnp.zeros(0, dtype=lhs.dtype) + elif grad: + assert ( + gelu_input is not None + ), "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." + elif gelu_input is None: + lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 + rhs_outer_dim = 1 if contracting_dims[1] == 0 else 0 + out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + gelu_input = jnp.zeros(out_shape, dtype=lhs.dtypes) + + out, _, _, pre_gelu_out, bias_grad = CollectiveGemmPrimitive.outer_primitive.bind( + lhs, + dummy_fp8_meta, + rhs, + dummy_fp8_meta, + bias, + gelu_input, + dummy_fp8_meta, + dummy_fp8_meta, + out_dtype=lhs.dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + if grad: + return out, pre_gelu_out, bias_grad + else: + return out, pre_gelu_out diff --git a/transformer_engine/jax/cpp_extensions/misc.py b/transformer_engine/jax/cpp_extensions/misc.py index 1f13484b98..15d7537fbd 100644 --- a/transformer_engine/jax/cpp_extensions/misc.py +++ b/transformer_engine/jax/cpp_extensions/misc.py @@ -81,6 +81,13 @@ def jax_dtype_to_te_dtype(jax_dtype): return converter.get(jax_dtype) +def jax_dtype_is_fp8(dtype): + """ + Check if the given jax.numpy.dtype is an FP8 dtype. 
+ """ + return dtypes.canonicalize_dtype(dtype) in [jnp.float8_e4m3fn, jnp.float8_e5m2] + + def get_padded_spec(arg_info): """ Get padded spec for partitioning from arguments' information diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index 02e6aaf9d5..afac283a6f 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -147,6 +147,31 @@ pybind11::bytes PackCustomCallFusedAttnDescriptor( NVTE_QKV_Layout qkv_layout, DType dtype, DType wkspace_dtype, bool is_training, bool deterministic, int64_t window_size_left, int64_t window_size_right); +struct CustomCallGemmDescriptor { + size_t batch; + size_t m; + size_t k; + size_t n; + size_t workspace_size; + DType operand_dtype; + DType bias_dtype; + DType out_dtype; + bool lhs_trans; + bool rhs_trans; + bool fuse_gelu; + bool fuse_bias; + bool grad; + bool accumulate; + bool use_split_accumulator; +}; + +pybind11::bytes PackCustomCallGemmDescriptor(size_t batch, size_t m, size_t n, size_t k, + size_t workspace_size, DType operand_dtype, + DType out_dtype, DType bias_dtype, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, + bool grad, bool accumulate, + bool use_split_accumulator); + // Transpose void Transpose(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); @@ -308,6 +333,20 @@ void FusedAttnBackward(cudaStream_t stream, void **buffers, const char *opaque, XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnBackwardHandler); +// GEMM + +void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); + +Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, + Buffer_Type out_amax, Buffer_Type out_scale, Result_Type out, + Result_Type out_amax_updated, Result_Type out_scale_updated, + Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type workspace, + bool lhs_trans, bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator); + +XLA_FFI_DECLARE_HANDLER_SYMBOL(GemmHandler); + } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp new file mode 100644 index 0000000000..f60ae510df --- /dev/null +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -0,0 +1,170 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#include "transformer_engine/gemm.h" + +#include "common/util/cuda_runtime.h" +#include "common/util/system.h" +#include "extensions.h" + +namespace transformer_engine { + +namespace jax { + +void GemmImpl(cudaStream_t stream, void *lhs, const std::vector &lhs_shape, + float *lhs_scale_inv, bool lhs_trans, void *rhs, const std::vector &rhs_shape, + float *rhs_scale_inv, bool rhs_trans, DType operand_dtype, void *bias, + DType bias_dtype, void *out, float *out_amax, float *out_scale, DType out_dtype, + void *pre_gelu_out, void *workspace, size_t workspace_size, bool fuse_gelu, + bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator) { + auto lhs_ = TensorWrapper(lhs, lhs_shape, operand_dtype, nullptr, nullptr, lhs_scale_inv); + auto rhs_ = TensorWrapper(rhs, rhs_shape, operand_dtype, nullptr, nullptr, rhs_scale_inv); + + std::vector out_shape(2, 0); + out_shape[0] = (lhs_trans) ? lhs_shape[1] : lhs_shape[0]; + out_shape[1] = (rhs_trans) ? rhs_shape[0] : rhs_shape[1]; + auto out_ = TensorWrapper(out, out_shape, out_dtype, out_amax, out_scale, nullptr); + + void *bias_ptr = (fuse_bias) ? bias : nullptr; + std::vector bias_shape = (fuse_bias) ? std::vector{out_shape[1]} + : std::vector{0}; + auto bias_ = TensorWrapper(bias_ptr, bias_shape, bias_dtype); + + void *pre_gelu_ptr = (fuse_gelu) ? pre_gelu_out : nullptr; + std::vector pre_gelu_shape = (fuse_gelu) ? out_shape : std::vector{0}; + auto pre_gelu_out_ = TensorWrapper(pre_gelu_ptr, pre_gelu_shape, bias_dtype); + auto workspace_ = TensorWrapper(workspace, std::vector{workspace_size}, DType::kByte); + + // cuBLAS is column-major, so we swap LHS and RHS in the arguments + auto num_math_sm = cuda::sm_count() - getenv("NVTE_EXT_MARGIN_SM", 0); + nvte_cublas_gemm(rhs_.data(), lhs_.data(), out_.data(), bias_.data(), pre_gelu_out_.data(), + (rhs_trans) ? CUBLAS_OP_T : CUBLAS_OP_N, (lhs_trans) ? CUBLAS_OP_T : CUBLAS_OP_N, + grad, workspace_.data(), accumulate, use_split_accumulator, num_math_sm, stream); +} + +void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len) { + // Inputs + auto *lhs = buffers[0]; + auto *lhs_scale_inv = reinterpret_cast(buffers[1]); + auto *rhs = buffers[2]; + auto *rhs_scale_inv = reinterpret_cast(buffers[3]); + auto *bias = buffers[4]; + auto *gelu_input = buffers[5]; + auto *out_amax = reinterpret_cast(buffers[6]); + auto *out_scale = reinterpret_cast(buffers[7]); + + // Outputs + auto *out = buffers[8]; + auto *out_amax_updated = reinterpret_cast(buffers[9]); + auto *out_scale_updated = reinterpret_cast(buffers[10]); + auto *pre_gelu_out = buffers[11]; + auto *bias_grad = buffers[12]; + auto *workspace = buffers[13]; + + // Operand aliasing + NVTE_CHECK(bias == bias_grad, + "bias not bound to bias_grad in TE/JAX GEMM"); + NVTE_CHECK(gelu_input == pre_gelu_out, + "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); + NVTE_CHECK(out_amax == out_amax_updated, + "out_amax not bound to out_amax_updated in TE/JAX GEMM"); + NVTE_CHECK(out_scale == out_scale_updated, + "out_scale not bound to out_scale_updated in TE/JAX GEMM"); + + // GEMM sizing + const auto &desc = *UnpackOpaque(opaque, opaque_len); + std::vector lhs_shape = {(desc.lhs_trans) ? desc.k : desc.m, + (desc.lhs_trans) ? desc.m : desc.k}; + std::vector rhs_shape = {(desc.rhs_trans) ? desc.n : desc.k, + (desc.rhs_trans) ? 
desc.k : desc.n}; + + GemmImpl(stream, lhs, lhs_shape, lhs_scale_inv, desc.lhs_trans, rhs, rhs_shape, rhs_scale_inv, + desc.rhs_trans, desc.operand_dtype, bias, desc.bias_dtype, out, out_amax, out_scale, + desc.out_dtype, pre_gelu_out, workspace, desc.workspace_size, desc.fuse_gelu, + desc.fuse_bias, desc.grad, desc.accumulate, desc.use_split_accumulator); +} + +Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, + Buffer_Type out_amax, Buffer_Type out_scale, Result_Type out, + Result_Type out_amax_updated, Result_Type out_scale_updated, + Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type workspace, + bool lhs_trans, bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator) { + // Inputs + auto lhs_ptr = lhs.untyped_data(); + auto lhs_scale_inv_ptr = reinterpret_cast(lhs_scale_inv.untyped_data()); + auto rhs_ptr = rhs.untyped_data(); + auto rhs_scale_inv_ptr = reinterpret_cast(rhs_scale_inv.untyped_data()); + auto operand_dtype = convert_ffi_datatype_to_te_dtype(lhs.element_type()); + auto bias_ptr = bias.untyped_data(); + auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type()); + auto gelu_input_ptr = gelu_input.untyped_data(); + auto out_amax_ptr = reinterpret_cast(out_amax.untyped_data()); + auto out_scale_ptr = reinterpret_cast(out_scale.untyped_data()); + + // Outputs + auto out_ptr = out->untyped_data(); + auto out_amax_updated_ptr = reinterpret_cast(out_amax_updated->untyped_data()); + auto out_scale_updated_ptr = reinterpret_cast(out_scale_updated->untyped_data()); + auto out_dtype = convert_ffi_datatype_to_te_dtype(out->element_type()); + auto pre_gelu_out_ptr = pre_gelu_out->untyped_data(); + auto bias_grad_ptr = bias_grad->untyped_data(); + auto workspace_ptr = workspace->untyped_data(); + auto workspace_size = workspace->dimensions().back(); + + // Operand aliasing + NVTE_CHECK(bias_ptr == bias_grad_ptr, + "bias not bound to bias_grad in TE/JAX GEMM"); + NVTE_CHECK(gelu_input_ptr == pre_gelu_out_ptr, + "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); + NVTE_CHECK(out_amax_ptr == out_amax_updated_ptr, + "out_amax not bound to out_amax_updated in TE/JAX GEMM"); + NVTE_CHECK(out_scale_ptr == out_scale_updated_ptr, + "out_scale not bound to out_scale_updated in TE/JAX GEMM"); + + // GEMM sizing + std::vector lhs_shape(lhs.dimensions().begin(), lhs.dimensions().end()); + std::vector rhs_shape(rhs.dimensions().begin(), rhs.dimensions().end()); + + // Swap A and B argument locations to match what the TE/common kernel expects + GemmImpl(stream, lhs_ptr, lhs_shape, lhs_scale_inv_ptr, lhs_trans, rhs_ptr, rhs_shape, + rhs_scale_inv_ptr, rhs_trans, operand_dtype, bias_ptr, bias_dtype, out_ptr, out_amax_ptr, + out_scale_ptr, out_dtype, pre_gelu_out_ptr, workspace_ptr, workspace_size, fuse_gelu, + fuse_bias, grad, accumulate, use_split_accumulator); + + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL(GemmHandler, GemmFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // lhs + .Arg() // lhs_scale_inv + .Arg() // rhs + .Arg() // rhs_scale_inv + .Arg() // bias + .Arg() // gelu_input + .Arg() // out_amax + .Arg() // out_scale + .Ret() // out + .Ret() // out_amax_updated + .Ret() // out_scale_updated + .Ret() // pre_gelu_out + .Ret() // bias_grad + .Ret() // workspace + .Attr("lhs_trans") + .Attr("rhs_trans") + .Attr("fuse_gelu") + .Attr("fuse_bias") + .Attr("grad") + 
.Attr("accumulate") + .Attr("use_split_accumulator"), + FFI_CudaGraph_Traits); + +} // namespace jax + +} // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/packing.cpp b/transformer_engine/jax/csrc/extensions/packing.cpp index 298478603b..1a9ce987af 100644 --- a/transformer_engine/jax/csrc/extensions/packing.cpp +++ b/transformer_engine/jax/csrc/extensions/packing.cpp @@ -80,5 +80,16 @@ pybind11::bytes PackCustomCallFusedAttnDescriptor( deterministic, window_size_left, window_size_right}); } +pybind11::bytes PackCustomCallGemmDescriptor(size_t batch, size_t m, size_t n, size_t k, + size_t workspace_size, DType operand_dtype, + DType bias_dtype, DType out_dtype, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, + bool grad, bool accumulate, + bool use_split_accumulator) { + return PackOpaque(CustomCallGemmDescriptor{batch, m, n, k, workspace_size, operand_dtype, + bias_dtype, out_dtype, lhs_trans, rhs_trans, fuse_gelu, + fuse_bias, grad, accumulate, use_split_accumulator}); +} + } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index 9b5c156e5d..7b8ebdcdd2 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -51,6 +51,7 @@ pybind11::dict Registrations() { EncapsulateFunction(ScaledUpperTriangMaskedSoftmaxBackward); dict["te_fused_attn_forward"] = EncapsulateFunction(FusedAttnForward); dict["te_fused_attn_backward"] = EncapsulateFunction(FusedAttnBackward); + dict["te_gemm"] = EncapsulateFunction(Gemm); // Transpose dict["te_transpose_ffi"] = EncapsulateFFI(TransposeHandler); @@ -101,6 +102,7 @@ pybind11::dict Registrations() { fused_attn_backward_ffi["execute"] = EncapsulateFFI(FusedAttnBackwardHandler); dict["te_fused_attn_backward_ffi"] = fused_attn_backward_ffi; + dict["te_gemm_ffi"] = EncapsulateFFI(GemmHandler); return dict; } @@ -114,10 +116,11 @@ PYBIND11_MODULE(transformer_engine_jax, m) { m.def("pack_norm_descriptor", &PackCustomCallNormDescriptor); m.def("pack_softmax_descriptor", &PackCustomCallSoftmaxDescriptor); m.def("pack_fused_attn_descriptor", &PackCustomCallFusedAttnDescriptor); + m.def("pack_gemm_descriptor", &PackCustomCallGemmDescriptor); m.def("get_fused_attn_backend", &GetFusedAttnBackend); m.def("get_cuda_version", &GetCudaRuntimeVersion); m.def("get_cudnn_version", &GetCudnnRuntimeVersion); - m.def("get_device_compute_capability", &GetDeviceComputeCapability); + m.def("get_device_compute_capability", &GetDeviceComputeCapability, pybind11::arg("gpu_id") = -1); m.def("get_cublasLt_version", &cublasLtGetVersion); m.def("get_dact_dbias_ct_workspace_sizes", &GetDActDBiasCastTransposeWorkspaceSizes); m.def("get_dbias_ct_workspace_sizes", &GetDBiasCastTransposeWorkspaceSizes); diff --git a/transformer_engine/jax/csrc/utils.h b/transformer_engine/jax/csrc/utils.h index 32de33bac9..b328c6e278 100644 --- a/transformer_engine/jax/csrc/utils.h +++ b/transformer_engine/jax/csrc/utils.h @@ -23,7 +23,7 @@ namespace jax { int GetCudaRuntimeVersion(); size_t GetCudnnRuntimeVersion(); -int GetDeviceComputeCapability(int gpu_id); +int GetDeviceComputeCapability(int gpu_id = -1); void PopulateRngStateAsync(void *rng_state_dst, const void *const seed, size_t q_max_seqlen, size_t kv_max_seqlen, NVTE_Fused_Attn_Backend backend, diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 8b13c47cd4..7312aa8295 100644 --- 
a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -334,6 +334,7 @@ def generate_fp8_meta_set(postfix: str) -> FP8MetaPackage: input_name_post_fix = f"_i_{postfix}" weight_name_post_fix = f"_w_{postfix}" grad_name_post_fix = f"_g_{postfix}" + output_name_post_fix = f"_o_{postfix}" def generate_a_set(target_postfix): amax = nn_partitioning.variable_with_axes( @@ -359,10 +360,10 @@ def generate_a_set(target_postfix): input_amax, input_scale = generate_a_set(input_name_post_fix) weight_amax, weight_scale = generate_a_set(weight_name_post_fix) grad_amax, grad_scale = generate_a_set(grad_name_post_fix) + output_amax, output_scale = generate_a_set(output_name_post_fix) - return FP8MetaPackage( - input_amax, input_scale, weight_amax, weight_scale, grad_amax, grad_scale - ) + return FP8MetaPackage(input_amax, input_scale, weight_amax, weight_scale, grad_amax, + grad_scale, output_amax, output_scale) class DenseGeneral(TransformerEngineBase): diff --git a/transformer_engine/jax/fp8.py b/transformer_engine/jax/fp8.py index 5df8ce4386..3d58c86e3e 100644 --- a/transformer_engine/jax/fp8.py +++ b/transformer_engine/jax/fp8.py @@ -86,10 +86,11 @@ class FP8MetaPackage: A container that contains all required meta data for FP8 """ - NUM_OF_META: int = 3 + NUM_OF_META: int = 4 INPUT_IDX: int = 0 WEIGHT_IDX: int = 1 GRAD_IDX: int = 2 + OUTPUT_IDX: int = 3 def __init__( self, @@ -99,6 +100,8 @@ def __init__( weight_scale: jnp.ndarray, grad_amax: jnp.ndarray, grad_scale: jnp.ndarray, + output_amax: jnp.ndarray, + output_scale: jnp.ndarray, ) -> None: self._amax_list = [None] * FP8MetaPackage.NUM_OF_META @@ -110,6 +113,8 @@ def __init__( self._scale_list[FP8MetaPackage.WEIGHT_IDX] = weight_scale self._amax_list[FP8MetaPackage.GRAD_IDX] = grad_amax self._scale_list[FP8MetaPackage.GRAD_IDX] = grad_scale + self._amax_list[FP8MetaPackage.OUTPUT_IDX] = output_amax + self._scale_list[FP8MetaPackage.OUTPUT_IDX] = output_scale @property def amax_list(self) -> List[jnp.ndarray]: diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py new file mode 100644 index 0000000000..ccd109e095 --- /dev/null +++ b/transformer_engine/jax/gemm.py @@ -0,0 +1,425 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
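+
+# This module provides the user-facing `gemm`, `fp8_gemm`, and `type_safe_gemm` wrappers around
+# the collective GEMM custom op, each registered with a custom VJP so that the bias and GELU
+# fusions remain differentiable.
+#
+# Minimal usage sketch (shapes and dtype below are illustrative assumptions, not part of this
+# change):
+#
+#     import jax
+#     import jax.numpy as jnp
+#     from transformer_engine.jax.gemm import gemm
+#
+#     key = jax.random.PRNGKey(0)
+#     x = jax.random.normal(key, (32, 128), jnp.bfloat16)        # activations
+#     kernel = jax.random.normal(key, (128, 256), jnp.bfloat16)  # weights
+#     out = gemm(x, kernel, contracting_dims=(1, 0))             # intended to match jnp.dot(x, kernel)
+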
+from functools import partial +from typing import Optional, Tuple, Union + +import jax +import jax.numpy as jnp +from jax.typing import ArrayLike +from jax.ad_checkpoint import checkpoint_name + +from .fp8 import FP8Helper, FP8MetaPackage +from .cpp_extensions import ( + gemm_impl, + fp8_gemm_impl, + cast_fp8, + cast_transpose, + dact_lu, + dbias_cast_transpose, + dact_lu_dbias_cast_transpose, +) + + + +__all__ = [ + "gemm", + "fp8_gemm", + "type_safe_gemm", +] + + +def gemm( + x: ArrayLike, + kernel: ArrayLike, + bias: Optional[ArrayLike] = None, + contracting_dims: Tuple[int, int] = (1, 0), + fuse_gelu: bool = False, + accumulate: bool = False, + use_split_accumulator: bool = False, +) -> ArrayLike: + """Non-FP8 collective/distributed `nvte_cublas_gemm()` with GELU and bias-add fusions.""" + return _gemm(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + + +@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6)) +def _gemm( + x: ArrayLike, + kernel: ArrayLike, + bias: Union[ArrayLike, None], + contracting_dims: Tuple[int, int], + fuse_gelu: bool, + accumulate: bool, + use_split_accumulator: bool, +) -> ArrayLike: + out, _ = _gemm_fwd_rule(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, + use_split_accumulator) + return out + + +def _gemm_fwd_rule( + x: ArrayLike, + kernel: ArrayLike, + bias: ArrayLike, + contracting_dims: Tuple[int, int], + fuse_gelu: bool, + accumulate: bool, + use_split_accumulator: bool, +) -> Tuple[ArrayLike, ...]: + fuse_bias = bias is not None + + out, pre_gelu_out = gemm_impl( + x, + kernel, + bias=bias, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator + ) + + ctx = ( + x, + kernel, + pre_gelu_out if fuse_gelu else None, + fuse_bias, + ) + + return out, ctx + + +def _gemm_bwd_rule( + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + ctx, + grad, +): + x, kernel, pre_gelu_out, fuse_bias = ctx + + x_t_contracting = 0 if contracting_dims[0] == 1 else 1 + wgrad, dgelu, bgrad = gemm_impl( + x, + grad, + gelu_input=pre_gelu_out, + contracting_dims=(x_t_contracting, 0), + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=True, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + kernel_t_contracting = 1 if contracting_dims[1] == 0 else 0 + dgrad, *_ = gemm_impl( + dgelu if fuse_gelu else grad, + kernel, + gelu_input=pre_gelu_out, + contracting_dims=(1, kernel_t_contracting), + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=True, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + if not fuse_bias: + bgrad = None + + return dgrad, wgrad, bgrad + + +_gemm.defvjp(_gemm_fwd_rule, _gemm_bwd_rule) + + +def fp8_gemm( + x: ArrayLike, + kernel: ArrayLike, + fp8_meta: FP8MetaPackage, + bias: Optional[ArrayLike] = None, + out_dtype: jnp.dtype = jnp.bfloat16, + contracting_dims: Tuple[int, int] = (1, 1), + fuse_gelu: bool = False, + accumulate: bool = False, + use_split_accumulator: bool = False, +) -> ArrayLike: + return _fp8_gemm(x, kernel, bias, fp8_meta.amax_list, fp8_meta.scale_list, out_dtype, + contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + + +@partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9)) +def _fp8_gemm( + x: ArrayLike, + kernel: ArrayLike, + bias: ArrayLike, + amax_list: ArrayLike, + scale_list: ArrayLike, + out_dtype: jnp.dtype, + contracting_dims: Tuple[int, int], + fuse_gelu: bool, + accumulate: bool, + 
use_split_accumulator: bool, +) -> ArrayLike: + """Non-FP8 `nvte_cublas_gemm()` with optional GELU and bias-add fusions.""" + out, _ = _fp8_gemm_fwd_rule(x, kernel, bias, amax_list, scale_list, out_dtype, + contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + return out + + +def _fp8_gemm_fwd_rule( + x: ArrayLike, + kernel: ArrayLike, + bias: ArrayLike, + amax_list: ArrayLike, + scale_list: ArrayLike, + out_dtype: jnp.dtype, + contracting_dims: Tuple[int, int], + fuse_gelu: bool, + accumulate: bool, + use_split_accumulator: bool, +) -> Tuple[ArrayLike, ...]: + fuse_bias = bias is not None + + maybe_fm32_to_fp32, maybe_fp32_to_fm32 = FP8Helper.generate_fp8_meta_dtype_converter_pair( + *amax_list, *scale_list, + ) + amax_list = maybe_fm32_to_fp32(*amax_list) + scale_list = maybe_fm32_to_fp32(*scale_list) + + fwd_dtype = FP8Helper.FWD_DTYPE + bwd_dtype = FP8Helper.BWD_DTYPE + fp8_dtype_list = [fwd_dtype, fwd_dtype, bwd_dtype, fwd_dtype] + scale_list, scale_inv_list = FP8MetaPackage.update_fp8_scale( + amax_list, scale_list, fp8_dtype_list + ) + amax_list = FP8MetaPackage.update_amax_list(amax_list) + + x_amax = amax_list[FP8MetaPackage.INPUT_IDX][0:1] + x_scale = scale_list[FP8MetaPackage.INPUT_IDX] + x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] + if x.dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + if contracting_dims[0] == 0: + _, casted_x, updated_x_amax = cast_transpose( + x, + x_amax, + x_scale, + x_scale_inv, + fwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) + else: + casted_x, updated_x_amax = cast_fp8(x, x_amax, x_scale, x_scale_inv, fwd_dtype) + else: + if contracting_dims[0] == 0: + casted_x_t = x + casted_x = casted_x_t.transpose() + else: + casted_x = x + updated_x_amax = x_amax + + kernel_amax = amax_list[FP8MetaPackage.WEIGHT_IDX][0:1] + kernel_scale = scale_list[FP8MetaPackage.WEIGHT_IDX] + kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] + if kernel.dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + if contracting_dims[1] == 0: # need to transpose the kernel for FP8 GEMM + _, casted_kernel_t, updated_kernel_amax = cast_transpose( + kernel, + kernel_amax, + kernel_scale, + kernel_scale_inv, + fwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) + else: + casted_kernel_t, updated_kernel_amax = cast_fp8( + kernel, + kernel_amax, + kernel_scale, + kernel_scale_inv, + fwd_dtype, + ) + else: + if contracting_dims[1] == 0: + casted_kernel = kernel + casted_kernel_t = casted_kernel.transpose() + else: + casted_kernel_t = kernel + updated_kernel_amax = kernel_amax + + out_amax = ( + amax_list[FP8MetaPackage.OUTPUT_IDX][0:1] + if out_dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] + else None + ) + out_scale = ( + scale_list[FP8MetaPackage.OUTPUT_IDX][0:1] + if out_dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] + else None + ) + out, updated_out_amax, updated_out_scale, pre_gelu_out = fp8_gemm_impl( + casted_x, + x_scale_inv, + casted_kernel_t, + kernel_scale_inv, + bias=bias, + out_amax=out_amax, + out_scale=out_scale, + out_dtype=out_dtype, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator + ) + if out_dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + updated_out_amax = None + updated_out_scale = None + + ctx = ( + casted_x, + casted_kernel_t, + amax_list, + scale_list, + scale_inv_list, + updated_x_amax, + updated_kernel_amax, + pre_gelu_out if fuse_gelu else None, + fuse_bias, + maybe_fp32_to_fm32 + ) + + return (out, 
updated_out_amax, updated_out_scale), ctx + + +def _fp8_gemm_bwd_rule( + out_dtype, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + ctx, + grad, +): + ( + casted_x, + casted_kernel_t, + amax_list, + scale_list, + scale_inv_list, + updated_x_amax, + updated_kernel_amax, + pre_gelu_out, + fuse_bias, + maybe_fp32_to_fm32 + ) = ctx + + fwd_dtype = FP8Helper.FWD_DTYPE + bwd_dtype = FP8Helper.BWD_DTYPE + + grad_amax = amax_list[FP8MetaPackage.GRAD_IDX][0:1] + grad_scale = scale_list[FP8MetaPackage.GRAD_IDX] + grad_scale_inv = scale_inv_list[FP8MetaPackage.GRAD_ID] + if fuse_bias and not fuse_gelu: + # Since there is no GELU fusion, we need to fuse dbias into this cast_transpose. + _, casted_grad_t, bgrad, updated_grad_amax = dbias_cast_transpose( + grad, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) + else: + # If both bias and GELU is fused into the forward pass, we will fuse dbias later with + # dGELU. No need to do it here. + _, casted_grad_t, updated_grad_amax = cast_transpose( + grad, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) + bgrad = None + + + + x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] + wgrad, *_ = fp8_gemm_impl( + casted_x, + x_scale_inv, + casted_grad_t, + grad_scale_inv, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + if fuse_gelu and fuse_bias: + # Fuse dbias into this dGELU. + casted_dgelu, casted_dgelu_t, bgrad, updated_dgelu_amax = dact_lu_dbias_cast_transpose( + grad, + pre_gelu_out, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + activation_type=("gelu", ), + ) + elif fuse_gelu: + # No bias to fuse so we just do dGELU. + casted_dgelu, casted_dgelu_t, updated_dgelu_amax = dact_lu(grad, pre_gelu_out, ("gelu", )) + bgrad = None + + kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] + dgrad, *_ = gemm_impl( + casted_dgelu if fuse_gelu else grad, + grad_scale_inv, + casted_kernel_t, + kernel_scale_inv, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + + amax_list[FP8MetaPackage.INPUT_IDX] = ( + amax_list[FP8MetaPackage.INPUT_IDX].at[0].set(updated_x_amax[0]) + ) + amax_list[FP8MetaPackage.WEIGHT_IDX] = ( + amax_list[FP8MetaPackage.WEIGHT_IDX].at[0].set(updated_kernel_amax[0]) + ) + + amax_list = maybe_fp32_to_fm32(*amax_list) + scale_list = maybe_fp32_to_fm32(*scale_list) + + return dgrad, wgrad, bgrad, amax_list, scale_list + + +_fp8_gemm.defvjp(_fp8_gemm_fwd_rule, _fp8_gemm_bwd_rule) + + +def type_safe_gemm( + x: ArrayLike, + kernel: ArrayLike, + bias: Optional[ArrayLike] = None, + fp8_meta: Optional[FP8MetaPackage] = None, + out_dtype: Optional[jnp.dtype] = None, + contracting_dims: Tuple[int, int] = (1, 0), + fuse_gelu: bool = False, + accumulate: bool = False, + use_split_accumulator: bool = False, +) -> ArrayLike: + if (x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] + or kernel.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2]): + assert fp8_meta is not None, "GEMM operands have FP8 dtypes but FP8MetaPackage is None." 
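+    # Dispatch on the presence of an FP8MetaPackage: FP8 metadata routes to the fp8_gemm()
+    # wrapper, everything else falls back to the plain gemm() wrapper.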
+ + if fp8_meta is not None: + return fp8_gemm(x, kernel, bias, fp8_meta, out_dtype, contracting_dims, fuse_gelu, + accumulate, use_split_accumulator) + else: + return gemm(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator) From c9774d8c203d5b0f5769f47daf70e0c655d0d110 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 14 Nov 2024 17:59:20 +0000 Subject: [PATCH 02/32] fixed batching rules to accommodated batched RHS operand for GEMM Signed-off-by: Alp Dener --- .../common/util/pybind_helper.h | 138 ++++++++++-------- transformer_engine/jax/cpp_extensions/gemm.py | 133 ++++++----------- .../jax/csrc/extensions/pybind.cpp | 59 +------- 3 files changed, 123 insertions(+), 207 deletions(-) diff --git a/transformer_engine/common/util/pybind_helper.h b/transformer_engine/common/util/pybind_helper.h index 432ac815ec..a36ff3f0f9 100644 --- a/transformer_engine/common/util/pybind_helper.h +++ b/transformer_engine/common/util/pybind_helper.h @@ -8,72 +8,88 @@ #define TRANSFORMER_ENGINE_COMMON_UTIL_PYBIND_HELPER_H_ #include +#include #include #include #include #include "cuda_runtime.h" -#define NVTE_DECLARE_COMMON_PYBIND11_HANDLES(m) \ - pybind11::enum_(m, "DType") \ - .value("kByte", transformer_engine::DType::kByte) \ - .value("kInt32", transformer_engine::DType::kInt32) \ - .value("kFloat32", transformer_engine::DType::kFloat32) \ - .value("kFloat16", transformer_engine::DType::kFloat16) \ - .value("kBFloat16", transformer_engine::DType::kBFloat16) \ - .value("kFloat8E4M3", transformer_engine::DType::kFloat8E4M3) \ - .value("kFloat8E5M2", transformer_engine::DType::kFloat8E5M2); \ - pybind11::enum_(m, "NVTE_Bias_Type") \ - .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) \ - .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) \ - .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS) \ - .value("NVTE_ALIBI", NVTE_Bias_Type::NVTE_ALIBI); \ - pybind11::enum_(m, "NVTE_Mask_Type") \ - .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) \ - .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) \ - .value("NVTE_CAUSAL_MASK", NVTE_Mask_Type::NVTE_CAUSAL_MASK) \ - .value("NVTE_PADDING_CAUSAL_MASK", NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) \ - .value("NVTE_CAUSAL_BOTTOM_RIGHT_MASK", NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK) \ - .value("NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK", \ - NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK); \ - pybind11::enum_(m, "NVTE_QKV_Layout") \ - .value("NVTE_SB3HD", NVTE_QKV_Layout::NVTE_SB3HD) \ - .value("NVTE_SBH3D", NVTE_QKV_Layout::NVTE_SBH3D) \ - .value("NVTE_SBHD_SB2HD", NVTE_QKV_Layout::NVTE_SBHD_SB2HD) \ - .value("NVTE_SBHD_SBH2D", NVTE_QKV_Layout::NVTE_SBHD_SBH2D) \ - .value("NVTE_SBHD_SBHD_SBHD", NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD) \ - .value("NVTE_BS3HD", NVTE_QKV_Layout::NVTE_BS3HD) \ - .value("NVTE_BSH3D", NVTE_QKV_Layout::NVTE_BSH3D) \ - .value("NVTE_BSHD_BS2HD", NVTE_QKV_Layout::NVTE_BSHD_BS2HD) \ - .value("NVTE_BSHD_BSH2D", NVTE_QKV_Layout::NVTE_BSHD_BSH2D) \ - .value("NVTE_BSHD_BSHD_BSHD", NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD) \ - .value("NVTE_T3HD", NVTE_QKV_Layout::NVTE_T3HD) \ - .value("NVTE_TH3D", NVTE_QKV_Layout::NVTE_TH3D) \ - .value("NVTE_THD_T2HD", NVTE_QKV_Layout::NVTE_THD_T2HD) \ - .value("NVTE_THD_TH2D", NVTE_QKV_Layout::NVTE_THD_TH2D) \ - .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD); \ - pybind11::enum_(m, "NVTE_Fused_Attn_Backend") \ - .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) \ - 
.value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) \ - .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) \ - .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); \ - pybind11::enum_(m, "CommOverlapType") \ - .value("RS", transformer_engine::CommOverlapType::RS) \ - .value("AG", transformer_engine::CommOverlapType::AG); \ - pybind11::enum_(m, "CommOverlapAlgo") \ - .value("BULK_OVERLAP_AG", transformer_engine::CommOverlapAlgo::BULK_OVERLAP_AG) \ - .value("BULK_OVERLAP_RS", transformer_engine::CommOverlapAlgo::BULK_OVERLAP_RS) \ - .value("SPLIT_PIPELINED_AG_P2P", \ - transformer_engine::CommOverlapAlgo::SPLIT_PIPELINED_AG_P2P) \ - .value("SPLIT_PIPELINED_RS", transformer_engine::CommOverlapAlgo::SPLIT_PIPELINED_RS) \ - .value("SPLIT_PIPELINED_RS_P2P", \ - transformer_engine::CommOverlapAlgo::SPLIT_PIPELINED_RS_P2P) \ - .value("ATOMIC_GEMM_RS", transformer_engine::CommOverlapAlgo::ATOMIC_GEMM_RS) \ - .value("ATOMIC_GEMM_AG_P2P", transformer_engine::CommOverlapAlgo::ATOMIC_GEMM_AG_P2P) \ - .value("ATOMIC_GEMM_RS_P2P", transformer_engine::CommOverlapAlgo::ATOMIC_GEMM_RS_P2P); \ - m.def("device_supports_multicast", &transformer_engine::cuda::supports_multicast, \ - py::call_guard(), py::arg("device_id") = -1); \ - m.def("ubuf_built_with_mpi", &transformer_engine::ubuf_built_with_mpi, \ - py::call_guard()); +#define NVTE_DECLARE_COMMON_PYBIND11_HANDLES(m) \ + pybind11::enum_(m, "DType") \ + .value("kByte", transformer_engine::DType::kByte) \ + .value("kInt32", transformer_engine::DType::kInt32) \ + .value("kFloat32", transformer_engine::DType::kFloat32) \ + .value("kFloat16", transformer_engine::DType::kFloat16) \ + .value("kBFloat16", transformer_engine::DType::kBFloat16) \ + .value("kFloat8E4M3", transformer_engine::DType::kFloat8E4M3) \ + .value("kFloat8E5M2", transformer_engine::DType::kFloat8E5M2); \ + pybind11::enum_(m, "NVTE_Bias_Type") \ + .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) \ + .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) \ + .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS) \ + .value("NVTE_ALIBI", NVTE_Bias_Type::NVTE_ALIBI); \ + pybind11::enum_(m, "NVTE_Mask_Type") \ + .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) \ + .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) \ + .value("NVTE_CAUSAL_MASK", NVTE_Mask_Type::NVTE_CAUSAL_MASK) \ + .value("NVTE_PADDING_CAUSAL_MASK", NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) \ + .value("NVTE_CAUSAL_BOTTOM_RIGHT_MASK", NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK) \ + .value("NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK", \ + NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK); \ + pybind11::enum_(m, "NVTE_QKV_Format") \ + .value("NVTE_SBHD", NVTE_QKV_Format::NVTE_SBHD) \ + .value("NVTE_BSHD", NVTE_QKV_Format::NVTE_BSHD) \ + .value("NVTE_THD", NVTE_QKV_Format::NVTE_THD); \ + pybind11::enum_(m, "NVTE_QKV_Layout") \ + .value("NVTE_SB3HD", NVTE_QKV_Layout::NVTE_SB3HD) \ + .value("NVTE_SBH3D", NVTE_QKV_Layout::NVTE_SBH3D) \ + .value("NVTE_SBHD_SB2HD", NVTE_QKV_Layout::NVTE_SBHD_SB2HD) \ + .value("NVTE_SBHD_SBH2D", NVTE_QKV_Layout::NVTE_SBHD_SBH2D) \ + .value("NVTE_SBHD_SBHD_SBHD", NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD) \ + .value("NVTE_BS3HD", NVTE_QKV_Layout::NVTE_BS3HD) \ + .value("NVTE_BSH3D", NVTE_QKV_Layout::NVTE_BSH3D) \ + .value("NVTE_BSHD_BS2HD", NVTE_QKV_Layout::NVTE_BSHD_BS2HD) \ + .value("NVTE_BSHD_BSH2D", NVTE_QKV_Layout::NVTE_BSHD_BSH2D) \ + .value("NVTE_BSHD_BSHD_BSHD", NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD) \ + 
.value("NVTE_T3HD", NVTE_QKV_Layout::NVTE_T3HD) \ + .value("NVTE_TH3D", NVTE_QKV_Layout::NVTE_TH3D) \ + .value("NVTE_THD_T2HD", NVTE_QKV_Layout::NVTE_THD_T2HD) \ + .value("NVTE_THD_TH2D", NVTE_QKV_Layout::NVTE_THD_TH2D) \ + .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD); \ + pybind11::enum_(m, "NVTE_Fused_Attn_Backend") \ + .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) \ + .value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) \ + .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) \ + .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); \ + pybind11::enum_(m, "NVTE_Activation_Type") \ + .value("GELU", NVTE_Activation_Type::GELU) \ + .value("GEGLU", NVTE_Activation_Type::GEGLU) \ + .value("SILU", NVTE_Activation_Type::SILU) \ + .value("SWIGLU", NVTE_Activation_Type::SWIGLU) \ + .value("RELU", NVTE_Activation_Type::RELU) \ + .value("REGLU", NVTE_Activation_Type::REGLU) \ + .value("QGELU", NVTE_Activation_Type::QGELU) \ + .value("QGEGLU", NVTE_Activation_Type::QGEGLU) \ + .value("SRELU", NVTE_Activation_Type::SRELU) \ + .value("SREGLU", NVTE_Activation_Type::SREGLU); \ + pybind11::enum_(m, "CommOverlapType") \ + .value("RS", transformer_engine::CommOverlapType::RS) \ + .value("AG", transformer_engine::CommOverlapType::AG); \ + pybind11::enum_(m, "CommOverlapAlgo") \ + .value("BULK_OVERLAP_AG", transformer_engine::CommOverlapAlgo::BULK_OVERLAP_AG) \ + .value("BULK_OVERLAP_RS", transformer_engine::CommOverlapAlgo::BULK_OVERLAP_RS) \ + .value("SPLIT_PIPELINED_AG_P2P", \ + transformer_engine::CommOverlapAlgo::SPLIT_PIPELINED_AG_P2P) \ + .value("SPLIT_PIPELINED_RS", transformer_engine::CommOverlapAlgo::SPLIT_PIPELINED_RS) \ + .value("SPLIT_PIPELINED_RS_P2P", \ + transformer_engine::CommOverlapAlgo::SPLIT_PIPELINED_RS_P2P) \ + .value("ATOMIC_GEMM_RS", transformer_engine::CommOverlapAlgo::ATOMIC_GEMM_RS) \ + .value("ATOMIC_GEMM_AG_P2P", transformer_engine::CommOverlapAlgo::ATOMIC_GEMM_AG_P2P) \ + .value("ATOMIC_GEMM_RS_P2P", transformer_engine::CommOverlapAlgo::ATOMIC_GEMM_RS_P2P); \ + m.def("device_supports_multicast", &transformer_engine::cuda::supports_multicast, \ + pybind11::call_guard(), pybind11::arg("device_id") = -1); \ + m.def("ubuf_built_with_mpi", &transformer_engine::ubuf_built_with_mpi, \ + pybind11::call_guard()); #endif diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 677fabca59..ceafce46e1 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -24,10 +24,10 @@ jax_dtype_is_fp8, get_padded_spec, is_ffi_enabled, + check_valid_batch_dims, ) from ..sharding import ( global_mesh_resource, - get_mesh_axis_size, lax_paral_op, all_reduce_max_along_all_axes_except_PP, ) @@ -83,9 +83,6 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av and dtypes.canonicalize_dtype(rhs_scale_inv_aval.dtype) == jnp.float32 ), "Missing RHS operand scale inverse in FP8 GEMM." - # Disallow batching for RHS - assert rhs_aval.ndim == 2, "GEMM does not support batching the RHS operand." - # Validate operand layouts lhs_inner_dim, rhs_inner_dim = map( lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, @@ -97,12 +94,12 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av ), f"Incompatible operand sizes: {lhs_aval.shape} x {rhs_aval.shape}." 
lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 - rhs_trans = rhs_inner_dim == 1 + rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 assert ( not (lhs_trans and rhs_trans) ), "GEMM does not support transposed LHS and transposed RHS at the same time." if is_fp8: - assert lhs_trans, "FP8 GEMM does not support transposed LHS." + assert not lhs_trans, "FP8 GEMM does not support transposed LHS." assert rhs_trans, "FP8 GEMM requires transposed RHS." # Validate output dtype @@ -124,11 +121,18 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av out_scale_updated_dtype = jnp.float32 # Infer output shape - rhs_outer_dim = 0 if rhs_trans else 1 lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 lhs_bdims = [dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] + lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) + rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 + rhs_bdims = [dim for dim in range(rhs_aval.ndim) + if dim not in [rhs_outer_dim, rhs_inner_dim]] + rhs_batch_size = reduce(operator.mul, rhs_bdims, 1) + assert ( + lhs_batch_size == rhs_batch_size + ), "LHS and RHS operands must have the same batched sizes." out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) # Validate bias/bias_grad shape against inferred output @@ -201,7 +205,7 @@ def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_ (lhs_aval.ndim, rhs_aval.ndim) ) lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 - rhs_trans = rhs_inner_dim == 1 + rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 operand_output_aliases = { 4: 4, # bias <--> bias_grad @@ -248,12 +252,9 @@ def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_ ] args = CustomCallArgsWrapper(out_types, operands, operand_shapes) - rhs_outer_dim = 0 if rhs_trans else 1 lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 - lhs_bdims = [dim for dim in range(lhs_aval.ndim) - if dim not in [lhs_outer_dim, lhs_inner_dim]] - lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] - m = reduce(operator.mul, lhs_batch_shape, 1) * lhs_aval.shape[lhs_outer_dim] + rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 + m = lhs_aval.shape[lhs_outer_dim] k = rhs_aval.shape[rhs_inner_dim] n = rhs_aval.shape[rhs_outer_dim] workspace_size = get_cublas_workspace_size_bytes() @@ -308,77 +309,32 @@ def impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out def batcher(batched_args, batch_dims, *, out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, use_split_accumulator): assert CollectiveGemmPrimitive.outer_primitive is not None + check_valid_batch_dims(batch_dims) + lhs_bdims, *_, bias_bdims, gelu_input_bdims, out_amax_bdims, out_scale_bdims = batch_dims - lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale = batched_args - assert rhs.ndim == 2, "TE/JAX GEMM custom op does not support batching RHS operands." 
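# A rough, self-contained illustration (toy shapes, not the TE primitive itself) of what
# the batching rule has to handle once the 2-D restriction on the RHS above is lifted:
# jax.vmap maps the GEMM over a leading batch axis of the LHS while broadcasting a
# shared 2-D RHS, so the rule may see an LHS of shape (B, M, K) next to an (N, K) kernel.
import jax
import jax.numpy as jnp

def toy_gemm(lhs, rhs):
    # Stand-in for the custom op with contracting_dims=(-1, -1): contract the last axes.
    return jnp.einsum("mk,nk->mn", lhs, rhs)

lhs = jnp.ones((4, 8, 16))   # (B, M, K)
rhs = jnp.ones((32, 16))     # (N, K), shared across the batch
out = jax.vmap(toy_gemm, in_axes=(0, None))(lhs, rhs)
assert out.shape == (4, 8, 32)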
- - # Get contracting and batch dimensions out - lhs_inner_dim, rhs_inner_dim = map( - lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, - contracting_dims, - (lhs.ndim, rhs.ndim) - ) - lhs_trans = lhs_inner_dim != lhs.ndim - 1 - rhs_trans = rhs_inner_dim == 1 - lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 - rhs_outer_dim = 0 if rhs_trans else 1 - lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] - - # FP8 GEMM only supports lhs_trans = False and rhs_trans = True so we may need to - # reorder the axes here to match - if jax_dtype_is_fp8(lhs.dtype): - lhs = jnp.transpose(lhs, (*lhs_bdims, lhs_outer_dim, lhs_inner_dim)) - lhs_trans = False - rhs = jnp.transpose(rhs, (rhs_outer_dim, rhs_inner_dim)) - rhs_trans = True - contracting_dims = (1, 1) - - # Collapse all non-contracting dimensions - batch_shape = [lhs.shape[dim] for dim in lhs_bdims] - batch_size = reduce(operator.mul, batch_shape, 1) - lhs_outer_size = lhs.shape[lhs_outer_dim] - lhs_shape_2d = ( - (lhs.shape[lhs_inner_dim], batch_size * lhs_outer_size) - if lhs_trans - else (batch_size * lhs_outer_size, lhs.shape[lhs_inner_dim]) - ) - lhs = jnp.reshape(lhs, lhs_shape_2d) - if fuse_gelu: - gelu_input = jnp.reshape( - gelu_input, (batch_size * lhs_outer_size, rhs.shape[rhs_outer_dim]) - ) - - outputs = CollectiveGemmPrimitive.outer_primitive.bind( - lhs, - lhs_scale_inv, - rhs, - rhs_scale_inv, - bias, - gelu_input, - out_amax, - out_scale, - out_dtype=out_dtype, - contracting_dims=contracting_dims, - fuse_gelu=fuse_gelu, - fuse_bias=fuse_bias, - grad=grad, - accumulate=accumulate, - use_split_accumulator=use_split_accumulator, - ) - - # Reshape output to recover original LHS batch shape - outputs[0] = jnp.reshape( - outputs[0], - (*batch_shape, lhs_outer_size, rhs.shape[rhs_outer_dim]) - ) - gelu_bdims = batch_dims[3] - if fuse_gelu: - outputs[3] = jnp.reshape(outputs[3], outputs[0].shape) - gelu_bdims = lhs_bdims + # FP8 GEMM only supports non-transposed LHS and transposed RHS + lhs, _, rhs, *_ = batched_args + lhs_trans = contracting_dims[0] != lhs.ndim - 1 + rhs_trans = contracting_dims[1] == rhs.ndim - 1 + lhs = jnp.matrix_transpose(lhs) if lhs_trans and jax_dtype_is_fp8(lhs.dtype) else lhs + rhs = jnp.matrix_transpose(rhs) if not rhs_trans and jax_dtype_is_fp8(rhs.dtype) else rhs + contracting_dims = (1, 1) return ( - outputs, - (lhs_bdims, batch_dims[1], batch_dims[2], gelu_bdims, batch_dims[4]) + CollectiveGemmPrimitive.outer_primitive.bind( + lhs, + batched_args[1], + rhs, + *batched_args[3:], + out_dtype=out_dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ) + (lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims) ) @staticmethod @@ -400,9 +356,9 @@ def infer_sharding_from_operands(out_dtype, contracting_dims, fuse_gelu, fuse_bi + "not already partitioned correctly.") lhs_trans = lhs_inner_dim != lhs.ndim - 1 - rhs_trans = rhs_inner_dim == 1 + rhs_trans = rhs_inner_dim == rhs.ndim - 1 lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 - rhs_outer_dim = 0 if rhs_trans else 1 + rhs_outer_dim = rhs.ndim - 2 if rhs_trans else rhs.ndim - 1 lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] rhs_outer_spec = rhs_spec[rhs_outer_dim] @@ -440,9 +396,9 @@ def partition(out_dtype, contracting_dims, fuse_gelu, 
fuse_bias, grad, accumulat ) lhs_trans = lhs_inner_dim != lhs.ndim - 1 - rhs_trans = rhs_inner_dim == 1 + rhs_trans = rhs_inner_dim == rhs.ndim - 1 lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 - rhs_outer_dim = 0 if rhs_trans else 1 + rhs_outer_dim = rhs.ndim - 2 if rhs_trans else rhs.ndim - 1 lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] rhs_outer_spec = rhs_spec[rhs_outer_dim] @@ -558,7 +514,7 @@ def fp8_gemm_impl( gelu_input = jnp.zeros(0, dtype=bias.dtype) elif gelu_input is None: lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 - rhs_outer_dim = 1 if contracting_dims[1] == 0 else 0 + rhs_outer_dim = rhs.ndim - 2 if contracting_dims[1] == 0 else rhs.ndim - 1 out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) gelu_input = jnp.zeros(out_shape, dtype=bias.dtype) @@ -599,7 +555,7 @@ def gemm_impl( dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 - rhs_outer_dim = 1 if contracting_dims[1] == 0 else 0 + rhs_outer_dim = rhs.ndim - 2 if contracting_dims[1] == 0 else rhs.ndim - 1 out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) if not fuse_bias: @@ -618,9 +574,6 @@ def gemm_impl( gelu_input is not None ), "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." elif gelu_input is None: - lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 - rhs_outer_dim = 1 if contracting_dims[1] == 0 else 0 - out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) gelu_input = jnp.zeros(out_shape, dtype=lhs.dtypes) out, _, _, pre_gelu_out, bias_grad = CollectiveGemmPrimitive.outer_primitive.bind( diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index 7b8ebdcdd2..ddf98d9d78 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -4,6 +4,7 @@ * See LICENSE for license information. 
************************************************************************/ +#include "common/util/pybind_helper.h" #include "extensions.h" namespace transformer_engine { @@ -107,6 +108,8 @@ pybind11::dict Registrations() { } PYBIND11_MODULE(transformer_engine_jax, m) { + NVTE_DECLARE_COMMON_PYBIND11_HANDLES(m) + m.def("registrations", &Registrations); m.def("pack_common_descriptor", &PackCustomCallCommonDescriptor, pybind11::arg(), pybind11::arg(), pybind11::arg(), pybind11::arg("act_num") = 0); @@ -129,62 +132,6 @@ PYBIND11_MODULE(transformer_engine_jax, m) { m.def("get_fused_attn_fwd_workspace_sizes", &GetFusedAttnForwardWorkspaceSizes); m.def("get_fused_attn_bwd_workspace_sizes", &GetFusedAttnBackwardWorkspaceSizes); m.def("nvte_get_qkv_format", &nvte_get_qkv_format); - - pybind11::enum_(m, "DType", pybind11::module_local()) - .value("kByte", DType::kByte) - .value("kInt32", DType::kInt32) - .value("kInt64", DType::kInt64) - .value("kFloat32", DType::kFloat32) - .value("kFloat16", DType::kFloat16) - .value("kBFloat16", DType::kBFloat16) - .value("kFloat8E4M3", DType::kFloat8E4M3) - .value("kFloat8E5M2", DType::kFloat8E5M2); - - pybind11::enum_(m, "NVTE_Bias_Type", pybind11::module_local()) - .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) - .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) - .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); - - pybind11::enum_(m, "NVTE_Mask_Type", pybind11::module_local()) - .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) - .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) - .value("NVTE_CAUSAL_MASK", NVTE_Mask_Type::NVTE_CAUSAL_MASK) - .value("NVTE_PADDING_CAUSAL_MASK", NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) - .value("NVTE_CAUSAL_BOTTOM_RIGHT_MASK", NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK) - .value("NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK", - NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK); - - pybind11::enum_(m, "NVTE_QKV_Layout", pybind11::module_local()) - .value("NVTE_BS3HD", NVTE_QKV_Layout::NVTE_BS3HD) - .value("NVTE_BSHD_BS2HD", NVTE_QKV_Layout::NVTE_BSHD_BS2HD) - .value("NVTE_BSHD_BSHD_BSHD", NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD) - .value("NVTE_T3HD", NVTE_QKV_Layout::NVTE_T3HD) - .value("NVTE_THD_T2HD", NVTE_QKV_Layout::NVTE_THD_T2HD) - .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD); - - pybind11::enum_(m, "NVTE_QKV_Format", pybind11::module_local()) - .value("NVTE_SBHD", NVTE_QKV_Format::NVTE_SBHD) - .value("NVTE_BSHD", NVTE_QKV_Format::NVTE_BSHD) - .value("NVTE_THD", NVTE_QKV_Format::NVTE_THD); - - pybind11::enum_(m, "NVTE_Activation_Type", pybind11::module_local()) - .value("GELU", NVTE_Activation_Type::GELU) - .value("GEGLU", NVTE_Activation_Type::GEGLU) - .value("SILU", NVTE_Activation_Type::SILU) - .value("SWIGLU", NVTE_Activation_Type::SWIGLU) - .value("RELU", NVTE_Activation_Type::RELU) - .value("REGLU", NVTE_Activation_Type::REGLU) - .value("QGELU", NVTE_Activation_Type::QGELU) - .value("QGEGLU", NVTE_Activation_Type::QGEGLU) - .value("SRELU", NVTE_Activation_Type::SRELU) - .value("SREGLU", NVTE_Activation_Type::SREGLU) - .export_values(); - - pybind11::enum_(m, "NVTE_Fused_Attn_Backend", pybind11::module_local()) - .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend) - .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) - .value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) - .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8); } } // namespace jax From 
e523018a8f7e3de2e1e4ab2a989eb6e13ca4a9b8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Nov 2024 18:14:24 +0000 Subject: [PATCH 03/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/jax/test_custom_call_compute.py | 15 +- transformer_engine/jax/cpp_extensions/gemm.py | 275 ++++++++++++------ .../jax/csrc/extensions/gemm.cpp | 16 +- transformer_engine/jax/flax/module.py | 12 +- transformer_engine/jax/gemm.py | 70 +++-- 5 files changed, 254 insertions(+), 134 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 9bf3f9fa91..355f587265 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -425,19 +425,16 @@ def _generate_inputs(b, m, n, k, dtype): a = jax.random.normal(subkeys[0], (b, m, k), dtype) b = jax.random.normal(subkeys[1], (n, k), dtype) bias_dtype = dtype if dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2] else jnp.bfloat16 - bias = jax.random.normal(subkeys[2], (n, ), bias_dtype) + bias = jax.random.normal(subkeys[2], (n,), bias_dtype) return a, b, bias @staticmethod def _generate_fp8_inputs(b, m, n, k, fp8_dtype): a, b, bias = TestGemm._generate_inputs(b, m, n, k, jnp.bfloat16) - a_scale, b_scale = map( - lambda x: (jnp.max(jnp.abs(x)) / 127.).astype(jnp.float32), - [a, b] - ) + a_scale, b_scale = map(lambda x: (jnp.max(jnp.abs(x)) / 127.0).astype(jnp.float32), [a, b]) a_q, b_q = map( lambda x, x_scale: jnp.round(x / x_scale).astype(fp8_dtype), - [(a, a_scale), (b, b_scale)] + [(a, a_scale), (b, b_scale)], ) return a, a_q, jnp.reciprocal(a_scale), b, b_q, jnp.reciprocal(b_scale), bias @@ -447,7 +444,7 @@ def _generate_fp8_inputs(b, m, n, k, fp8_dtype): def test_gemm(self, b, m, n, k, use_bias, do_gelu): a, b, bias = self._generate_inputs(b, m, n, k, jnp.bfloat16) - primitive_out = gemm(a, b, bias=bias if use_bias else None, layout='NT', do_gelu=do_gelu) + primitive_out = gemm(a, b, bias=bias if use_bias else None, layout="NT", do_gelu=do_gelu) ref_out = jnp.dot(a, b) if use_bias: ref_out += bias @@ -460,9 +457,7 @@ def test_gemm(self, b, m, n, k, use_bias, do_gelu): @pytest.mark.parametrize("m,n,k", GEMM_CASES) @pytest.mark.parametrize("fp8_dtype", FP8_COMPUTE_TYPE) def test_fp8_gemm(self, m, n, k, fp8_dtype): - a, a_q, a_scale_inv, b, b_q, b_scale_inv, _ = self._generate_fp8_inputs( - m, n, k, fp8_dtype - ) + a, a_q, a_scale_inv, b, b_q, b_scale_inv, _ = self._generate_fp8_inputs(m, n, k, fp8_dtype) primitive_out = fp8_gemm(a_q, a_scale_inv, b_q, b_scale_inv, out_dtype=jnp.bfloat16) ref_out = jnp.dot(a, b) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index ceafce46e1..2df05d6df4 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -58,9 +58,23 @@ class CollectiveGemmPrimitive(BasePrimitive): outer_primitive = None @staticmethod - def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_aval, - gelu_input_aval, out_amax_aval, out_scale_aval, out_dtype, contracting_dims, - fuse_gelu, fuse_bias, grad, accumulate, use_split_accumulator): + def abstract( + lhs_aval, + lhs_scale_inv_aval, + rhs_aval, + rhs_scale_inv_aval, + bias_aval, + gelu_input_aval, + out_amax_aval, + out_scale_aval, + out_dtype, + contracting_dims, + fuse_gelu, + fuse_bias, + grad, + accumulate, + use_split_accumulator, + ): """ cuBlasLt 
GEMM abstract """ @@ -87,7 +101,7 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av lhs_inner_dim, rhs_inner_dim = map( lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, contracting_dims, - (lhs_aval.ndim, rhs_aval.ndim) + (lhs_aval.ndim, rhs_aval.ndim), ) assert ( lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim] @@ -95,8 +109,8 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 - assert ( - not (lhs_trans and rhs_trans) + assert not ( + lhs_trans and rhs_trans ), "GEMM does not support transposed LHS and transposed RHS at the same time." if is_fp8: assert not lhs_trans, "FP8 GEMM does not support transposed LHS." @@ -104,8 +118,8 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av # Validate output dtype if jax_dtype_is_fp8(out_dtype): - assert ( - jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8(rhs_dtype) + assert jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8( + rhs_dtype ), "FP8 GEMM output requires FP8 inputs." assert ( out_amax_aval.size == out_scale_aval.size == 1 @@ -122,13 +136,15 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av # Infer output shape lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 - lhs_bdims = [dim for dim in range(lhs_aval.ndim) - if dim not in [lhs_outer_dim, lhs_inner_dim]] + lhs_bdims = [ + dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] + ] lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 - rhs_bdims = [dim for dim in range(rhs_aval.ndim) - if dim not in [rhs_outer_dim, rhs_inner_dim]] + rhs_bdims = [ + dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] + ] rhs_batch_size = reduce(operator.mul, rhs_bdims, 1) assert ( lhs_batch_size == rhs_batch_size @@ -139,9 +155,7 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av bias_dtype = jnp.bfloat16 if jax_dtype_is_fp8(out_dtype) else out_dtype if fuse_bias: assert ( - bias_aval.size > 0 - and bias_aval.ndim == 1 - and bias_aval.shape[0] == out_shape[-1] + bias_aval.size > 0 and bias_aval.ndim == 1 and bias_aval.shape[0] == out_shape[-1] ), "Incorrect bias shape." bias_dtype = dtypes.canonicalize_dtype(bias_aval.dtype) else: @@ -149,8 +163,8 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av # Validate GELU input/output if fuse_gelu: - assert ( - all([gelu_input_aval.shape[i] == out_shape[i] for i in len(out_shape)]) + assert all( + [gelu_input_aval.shape[i] == out_shape[i] for i in len(out_shape)] ), "Invalid GELU input shape." assert gelu_input_aval.dtype == bias_dtype, "Invalid GELU dtype." 
else: @@ -158,14 +172,17 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av # Create abstract arrays for all outputs out_aval = lhs_aval.update(shape=out_shape, dtype=out_dtype) - out_amax_updated_aval = out_amax_aval.update(shape=out_amax_aval.shape, - dtype=out_amax_updated_dtype) - out_scale_updated_aval = out_scale_aval.update(shape=out_scale_aval.shape, - dtype=out_scale_updated_dtype) + out_amax_updated_aval = out_amax_aval.update( + shape=out_amax_aval.shape, dtype=out_amax_updated_dtype + ) + out_scale_updated_aval = out_scale_aval.update( + shape=out_scale_aval.shape, dtype=out_scale_updated_dtype + ) pre_gelu_out_aval = gelu_input_aval.update(shape=gelu_input_aval.shape, dtype=bias_dtype) bias_grad_aval = bias_aval.update(shape=bias_aval.shape, dtype=bias_dtype) - workspace_aval = jax.core.ShapedArray(shape=(get_cublas_workspace_size_bytes(), ), - dtype=jnp.uint8) + workspace_aval = jax.core.ShapedArray( + shape=(get_cublas_workspace_size_bytes(),), dtype=jnp.uint8 + ) return ( out_aval, @@ -173,7 +190,7 @@ def abstract(lhs_aval, lhs_scale_inv_aval, rhs_aval, rhs_scale_inv_aval, bias_av out_scale_updated_aval, pre_gelu_out_aval, bias_grad_aval, - workspace_aval + workspace_aval, ) @staticmethod @@ -181,20 +198,31 @@ def outer_abstract(*args, **kwargs): """ cuBlasLt GEMM outer abstract """ - ( - out_aval, - out_amax_aval, - out_scale_aval, - pre_gelu_out_aval, - bias_grad_aval, - _ - ) = CollectiveGemmPrimitive.abstract(*args, **kwargs) + (out_aval, out_amax_aval, out_scale_aval, pre_gelu_out_aval, bias_grad_aval, _) = ( + CollectiveGemmPrimitive.abstract(*args, **kwargs) + ) return out_aval, out_amax_aval, out_scale_aval, pre_gelu_out_aval, bias_grad_aval @staticmethod - def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale, - *, out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, - use_split_accumulator): + def lowering( + ctx, + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + *, + out_dtype, + contracting_dims, + fuse_gelu, + fuse_bias, + grad, + accumulate, + use_split_accumulator, + ): """ Fused attention fwd lowering rules """ @@ -202,7 +230,7 @@ def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_ lhs_inner_dim, rhs_inner_dim = map( lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, contracting_dims, - (lhs_aval.ndim, rhs_aval.ndim) + (lhs_aval.ndim, rhs_aval.ndim), ) lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 @@ -232,7 +260,7 @@ def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_ fuse_bias=fuse_bias, grad=grad, accumulate=accumulate, - use_split_accumulator=use_split_accumulator + use_split_accumulator=use_split_accumulator, ) else: operands = [ @@ -260,10 +288,22 @@ def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_ workspace_size = get_cublas_workspace_size_bytes() operand_dtype = jax_dtype_to_te_dtype(lhs_aval.dtype) bias_dtype = jax_dtype_to_te_dtype(bias_aval.dtype) - opaque = tex.pack_gemm_descriptor(m, n, k, workspace_size, operand_dtype, - jax_dtype_to_te_dtype(out_dtype), bias_dtype, - lhs_trans, rhs_trans, fuse_gelu, fuse_bias, grad, - accumulate, use_split_accumulator) + opaque = tex.pack_gemm_descriptor( + m, + n, + k, + workspace_size, + operand_dtype, + jax_dtype_to_te_dtype(out_dtype), + bias_dtype, + lhs_trans, + rhs_trans, + fuse_gelu, + fuse_bias, + grad, + 
accumulate, + use_split_accumulator, + ) return custom_caller( CollectiveGemmPrimitive.name, @@ -274,9 +314,23 @@ def lowering(ctx, lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_ ) @staticmethod - def impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale, - out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, - use_split_accumulator): + def impl( + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + out_dtype, + contracting_dims, + fuse_gelu, + fuse_bias, + grad, + accumulate, + use_split_accumulator, + ): assert CollectiveGemmPrimitive.inner_primitive is not None ( @@ -306,13 +360,23 @@ def impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad @staticmethod - def batcher(batched_args, batch_dims, *, out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, - accumulate, use_split_accumulator): + def batcher( + batched_args, + batch_dims, + *, + out_dtype, + contracting_dims, + fuse_gelu, + fuse_bias, + grad, + accumulate, + use_split_accumulator, + ): assert CollectiveGemmPrimitive.outer_primitive is not None check_valid_batch_dims(batch_dims) lhs_bdims, *_, bias_bdims, gelu_input_bdims, out_amax_bdims, out_scale_bdims = batch_dims - # FP8 GEMM only supports non-transposed LHS and transposed RHS + # FP8 GEMM only supports non-transposed LHS and transposed RHS lhs, _, rhs, *_ = batched_args lhs_trans = contracting_dims[0] != lhs.ndim - 1 rhs_trans = contracting_dims[1] == rhs.ndim - 1 @@ -320,27 +384,33 @@ def batcher(batched_args, batch_dims, *, out_dtype, contracting_dims, fuse_gelu, rhs = jnp.matrix_transpose(rhs) if not rhs_trans and jax_dtype_is_fp8(rhs.dtype) else rhs contracting_dims = (1, 1) - return ( - CollectiveGemmPrimitive.outer_primitive.bind( - lhs, - batched_args[1], - rhs, - *batched_args[3:], - out_dtype=out_dtype, - contracting_dims=contracting_dims, - fuse_gelu=fuse_gelu, - fuse_bias=fuse_bias, - grad=grad, - accumulate=accumulate, - use_split_accumulator=use_split_accumulator, - ) - (lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims) - ) + return CollectiveGemmPrimitive.outer_primitive.bind( + lhs, + batched_args[1], + rhs, + *batched_args[3:], + out_dtype=out_dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + )(lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims) @staticmethod - def infer_sharding_from_operands(out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, - accumulate, use_split_accumulator, mesh, arg_infos, - result_infos): + def infer_sharding_from_operands( + out_dtype, + contracting_dims, + fuse_gelu, + fuse_bias, + grad, + accumulate, + use_split_accumulator, + mesh, + arg_infos, + result_infos, + ): del out_dtype, accumulate, use_split_accumulator, result_infos lhs, _, rhs, *_ = arg_infos lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) @@ -348,12 +418,14 @@ def infer_sharding_from_operands(out_dtype, contracting_dims, fuse_gelu, fuse_bi lhs_inner_dim, rhs_inner_dim = map( lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, contracting_dims, - (lhs.ndim, rhs.ndim) + (lhs.ndim, rhs.ndim), ) if lhs_spec[lhs_inner_dim] != rhs_spec[rhs_inner_dim] and not grad: - warnings.warn("Forcing the inner dimension of LHS to match the sharding of inner " - + "dimension of 
RHS. This can trigger additional communication if LHS is " - + "not already partitioned correctly.") + warnings.warn( + "Forcing the inner dimension of LHS to match the sharding of inner " + + "dimension of RHS. This can trigger additional communication if LHS is " + + "not already partitioned correctly." + ) lhs_trans = lhs_inner_dim != lhs.ndim - 1 rhs_trans = rhs_inner_dim == rhs.ndim - 1 @@ -383,8 +455,18 @@ def infer_sharding_from_operands(out_dtype, contracting_dims, fuse_gelu, fuse_bi return (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, bias_sharding) @staticmethod - def partition(out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulate, - use_split_accumulator, mesh, arg_infos, result_infos): + def partition( + out_dtype, + contracting_dims, + fuse_gelu, + fuse_bias, + grad, + accumulate, + use_split_accumulator, + mesh, + arg_infos, + result_infos, + ): del result_infos lhs, _, rhs, *_ = arg_infos lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) @@ -392,7 +474,7 @@ def partition(out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulat lhs_inner_dim, rhs_inner_dim = map( lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, contracting_dims, - (lhs.ndim, rhs.ndim) + (lhs.ndim, rhs.ndim), ) lhs_trans = lhs_inner_dim != lhs.ndim - 1 @@ -426,13 +508,27 @@ def partition(out_dtype, contracting_dims, fuse_gelu, fuse_bias, grad, accumulat gelu_spec = out_spec if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) - arg_shardings = (lhs_sharding, fp8_meta_sharding, rhs_sharding, fp8_meta_sharding, - bias_sharding, gelu_sharding, fp8_meta_sharding, fp8_meta_sharding) - out_shardings = (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, - bias_sharding) + arg_shardings = ( + lhs_sharding, + fp8_meta_sharding, + rhs_sharding, + fp8_meta_sharding, + bias_sharding, + gelu_sharding, + fp8_meta_sharding, + fp8_meta_sharding, + ) + out_shardings = ( + out_sharding, + fp8_meta_sharding, + fp8_meta_sharding, + gelu_sharding, + bias_sharding, + ) - def sharded_impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, - out_scale): + def sharded_impl( + lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale + ): ( out, out_amax_updated, @@ -465,8 +561,7 @@ def sharded_impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_a # GEMM output needs to be all-reduced when the contracting dimension is sharded. # If the layer is sequence-parallel, we also need to scatter the output, which we # can combine into a reduce-scatter here. 
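# A small, hedged sketch of the collective described in the comment above, written with
# plain pmap rather than the custom-partitioning machinery used here: each device holds
# a partial GEMM result over its shard of the contracting dimension, and a psum
# all-reduce combines them (a sequence-parallel layer would use jax.lax.psum_scatter to
# fold the scatter into the same reduction).
import jax
import jax.numpy as jnp

def partial_matmul(lhs_shard, rhs_shard):
    partial = lhs_shard @ rhs_shard               # partial product over the local K shard
    return jax.lax.psum(partial, axis_name="tp")  # all-reduce across the K-sharded axis

n_dev = jax.local_device_count()
lhs_shards = jnp.ones((n_dev, 8, 16))             # contracting dim split across devices
rhs_shards = jnp.ones((n_dev, 16, 32))
full_out = jax.pmap(partial_matmul, axis_name="tp")(lhs_shards, rhs_shards)  # (n_dev, 8, 32)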
- out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().cp_resource, - mesh) + out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().cp_resource, mesh) if fuse_gelu: pre_gelu_out = lax_paral_op( pre_gelu_out, jax.lax.psum, global_mesh_resource().cp_resource, mesh @@ -485,10 +580,10 @@ def fp8_gemm_impl( lhs_scale_inv: ArrayLike, rhs: ArrayLike, rhs_scale_inv: ArrayLike, - bias: Optional[ArrayLike] = None, + bias: Optional[ArrayLike] = None, gelu_input: Optional[ArrayLike] = None, - out_amax: Optional[ArrayLike] = None, - out_scale: Optional[ArrayLike] = None, + out_amax: Optional[ArrayLike] = None, + out_scale: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, contracting_dims: Tuple[int, int] = (1, 1), fuse_gelu: bool = False, @@ -506,9 +601,7 @@ def fp8_gemm_impl( if not fuse_bias: bias = jnp.zeros(0, dtype=jnp.bfloat16) else: - assert ( - bias is not None - ), "Missing bias in forward GEMM when bias epilogue is enabled." + assert bias is not None, "Missing bias in forward GEMM when bias epilogue is enabled." if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=bias.dtype) @@ -542,8 +635,8 @@ def fp8_gemm_impl( def gemm_impl( lhs: ArrayLike, rhs: ArrayLike, - bias: Optional[ArrayLike] = None, - gelu_input: Optional[ArrayLike] = None, + bias: Optional[ArrayLike] = None, + gelu_input: Optional[ArrayLike] = None, contracting_dims: Tuple[int, int] = (1, 0), fuse_gelu: bool = False, fuse_bias: bool = False, @@ -563,9 +656,7 @@ def gemm_impl( elif grad: bias = jnp.zeros(out_shape[-1], dtype=lhs.dtype) else: - assert ( - bias is not None - ), "Missing bias in forward GEMM when bias epilogue is enabled." + assert bias is not None, "Missing bias in forward GEMM when bias epilogue is enabled." if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=lhs.dtype) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index f60ae510df..5dae9d6757 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -29,8 +29,8 @@ void GemmImpl(cudaStream_t stream, void *lhs, const std::vector &lhs_sha auto out_ = TensorWrapper(out, out_shape, out_dtype, out_amax, out_scale, nullptr); void *bias_ptr = (fuse_bias) ? bias : nullptr; - std::vector bias_shape = (fuse_bias) ? std::vector{out_shape[1]} - : std::vector{0}; + std::vector bias_shape = + (fuse_bias) ? std::vector{out_shape[1]} : std::vector{0}; auto bias_ = TensorWrapper(bias_ptr, bias_shape, bias_dtype); void *pre_gelu_ptr = (fuse_gelu) ? 
pre_gelu_out : nullptr; @@ -65,12 +65,9 @@ void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque auto *workspace = buffers[13]; // Operand aliasing - NVTE_CHECK(bias == bias_grad, - "bias not bound to bias_grad in TE/JAX GEMM"); - NVTE_CHECK(gelu_input == pre_gelu_out, - "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); - NVTE_CHECK(out_amax == out_amax_updated, - "out_amax not bound to out_amax_updated in TE/JAX GEMM"); + NVTE_CHECK(bias == bias_grad, "bias not bound to bias_grad in TE/JAX GEMM"); + NVTE_CHECK(gelu_input == pre_gelu_out, "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); + NVTE_CHECK(out_amax == out_amax_updated, "out_amax not bound to out_amax_updated in TE/JAX GEMM"); NVTE_CHECK(out_scale == out_scale_updated, "out_scale not bound to out_scale_updated in TE/JAX GEMM"); @@ -117,8 +114,7 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i auto workspace_size = workspace->dimensions().back(); // Operand aliasing - NVTE_CHECK(bias_ptr == bias_grad_ptr, - "bias not bound to bias_grad in TE/JAX GEMM"); + NVTE_CHECK(bias_ptr == bias_grad_ptr, "bias not bound to bias_grad in TE/JAX GEMM"); NVTE_CHECK(gelu_input_ptr == pre_gelu_out_ptr, "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); NVTE_CHECK(out_amax_ptr == out_amax_updated_ptr, diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 7312aa8295..abe23fdf8b 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -362,8 +362,16 @@ def generate_a_set(target_postfix): grad_amax, grad_scale = generate_a_set(grad_name_post_fix) output_amax, output_scale = generate_a_set(output_name_post_fix) - return FP8MetaPackage(input_amax, input_scale, weight_amax, weight_scale, grad_amax, - grad_scale, output_amax, output_scale) + return FP8MetaPackage( + input_amax, + input_scale, + weight_amax, + weight_scale, + grad_amax, + grad_scale, + output_amax, + output_scale, + ) class DenseGeneral(TransformerEngineBase): diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index ccd109e095..79499725b7 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -21,7 +21,6 @@ ) - __all__ = [ "gemm", "fp8_gemm", @@ -52,8 +51,9 @@ def _gemm( accumulate: bool, use_split_accumulator: bool, ) -> ArrayLike: - out, _ = _gemm_fwd_rule(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, - use_split_accumulator) + out, _ = _gemm_fwd_rule( + x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator + ) return out @@ -76,7 +76,7 @@ def _gemm_fwd_rule( fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, accumulate=accumulate, - use_split_accumulator=use_split_accumulator + use_split_accumulator=use_split_accumulator, ) ctx = ( @@ -145,8 +145,18 @@ def fp8_gemm( accumulate: bool = False, use_split_accumulator: bool = False, ) -> ArrayLike: - return _fp8_gemm(x, kernel, bias, fp8_meta.amax_list, fp8_meta.scale_list, out_dtype, - contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + return _fp8_gemm( + x, + kernel, + bias, + fp8_meta.amax_list, + fp8_meta.scale_list, + out_dtype, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + ) @partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9)) @@ -163,8 +173,18 @@ def _fp8_gemm( use_split_accumulator: bool, ) -> ArrayLike: """Non-FP8 `nvte_cublas_gemm()` with optional GELU and bias-add fusions.""" - out, _ = _fp8_gemm_fwd_rule(x, kernel, bias, amax_list, 
scale_list, out_dtype, - contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + out, _ = _fp8_gemm_fwd_rule( + x, + kernel, + bias, + amax_list, + scale_list, + out_dtype, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + ) return out @@ -183,7 +203,8 @@ def _fp8_gemm_fwd_rule( fuse_bias = bias is not None maybe_fm32_to_fp32, maybe_fp32_to_fm32 = FP8Helper.generate_fp8_meta_dtype_converter_pair( - *amax_list, *scale_list, + *amax_list, + *scale_list, ) amax_list = maybe_fm32_to_fp32(*amax_list) scale_list = maybe_fm32_to_fp32(*scale_list) @@ -272,7 +293,7 @@ def _fp8_gemm_fwd_rule( fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, accumulate=accumulate, - use_split_accumulator=use_split_accumulator + use_split_accumulator=use_split_accumulator, ) if out_dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: updated_out_amax = None @@ -288,7 +309,7 @@ def _fp8_gemm_fwd_rule( updated_kernel_amax, pre_gelu_out if fuse_gelu else None, fuse_bias, - maybe_fp32_to_fm32 + maybe_fp32_to_fm32, ) return (out, updated_out_amax, updated_out_scale), ctx @@ -313,7 +334,7 @@ def _fp8_gemm_bwd_rule( updated_kernel_amax, pre_gelu_out, fuse_bias, - maybe_fp32_to_fm32 + maybe_fp32_to_fm32, ) = ctx fwd_dtype = FP8Helper.FWD_DTYPE @@ -347,8 +368,6 @@ def _fp8_gemm_bwd_rule( ) bgrad = None - - x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] wgrad, *_ = fp8_gemm_impl( casted_x, @@ -370,11 +389,11 @@ def _fp8_gemm_bwd_rule( bwd_dtype, static_axis_boundary=-1, transpose_axis_boundary=-1, - activation_type=("gelu", ), + activation_type=("gelu",), ) elif fuse_gelu: # No bias to fuse so we just do dGELU. - casted_dgelu, casted_dgelu_t, updated_dgelu_amax = dact_lu(grad, pre_gelu_out, ("gelu", )) + casted_dgelu, casted_dgelu_t, updated_dgelu_amax = dact_lu(grad, pre_gelu_out, ("gelu",)) bgrad = None kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] @@ -414,12 +433,23 @@ def type_safe_gemm( accumulate: bool = False, use_split_accumulator: bool = False, ) -> ArrayLike: - if (x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] - or kernel.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2]): + if x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] or kernel.dtype in [ + jnp.float8_e4m3fn, + jnp.float8_e5m2, + ]: assert fp8_meta is not None, "GEMM operands have FP8 dtypes but FP8MetaPackage is None." if fp8_meta is not None: - return fp8_gemm(x, kernel, bias, fp8_meta, out_dtype, contracting_dims, fuse_gelu, - accumulate, use_split_accumulator) + return fp8_gemm( + x, + kernel, + bias, + fp8_meta, + out_dtype, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + ) else: return gemm(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator) From 2c3dbf1cf516d3dec5022b9b8304ee0d053170ba Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Fri, 15 Nov 2024 23:56:38 +0000 Subject: [PATCH 04/32] re-applied bug fixes to working older version, updated backward pass, passing test Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 93 +++---- transformer_engine/jax/gemm.py | 260 +++++++++--------- 2 files changed, 174 insertions(+), 179 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 2df05d6df4..ee4c38d076 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1,7 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. 
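# This revision replaces the inline negative-axis normalization used earlier (which
# computed ndims - inner_dim and so maps -1 to ndims + 1) with the sanitize_dims helper
# added just below in this hunk. A standalone restatement of the intended behavior,
# with a couple of quick checks:
def sanitize_dims(dim, ndims):
    # Map a possibly negative axis index to its non-negative equivalent.
    return (ndims + dim) if dim < 0 else dim

assert sanitize_dims(-1, 3) == 2
assert sanitize_dims(1, 3) == 1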
-"""JAX/TE custom ops for cuBlasLt GEMM""" import warnings import operator from functools import reduce @@ -39,6 +38,10 @@ ] +def sanitize_dims(dim, ndims): + return (ndims + dim) if dim < 0 else dim + + def get_cublas_workspace_size_bytes() -> None: """Return 32 MiB if using hopper, 4 MiB for all other architectures.""" if tex.get_device_compute_capability() >= 90: @@ -98,11 +101,8 @@ def abstract( ), "Missing RHS operand scale inverse in FP8 GEMM." # Validate operand layouts - lhs_inner_dim, rhs_inner_dim = map( - lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, - contracting_dims, - (lhs_aval.ndim, rhs_aval.ndim), - ) + lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, + (lhs_aval.ndim, rhs_aval.ndim)) assert ( lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim] ), f"Incompatible operand sizes: {lhs_aval.shape} x {rhs_aval.shape}." @@ -134,23 +134,31 @@ def abstract( out_amax_updated_dtype = jnp.float32 out_scale_updated_dtype = jnp.float32 - # Infer output shape + # Make sure leading dimensions of RHS is broadcast-compatible with LHS lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 + rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 + lhs_bdims = [ dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] ] lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) - rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 - rhs_bdims = [ - dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] - ] - rhs_batch_size = reduce(operator.mul, rhs_bdims, 1) - assert ( - lhs_batch_size == rhs_batch_size - ), "LHS and RHS operands must have the same batched sizes." - out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) + if rhs_aval.ndim > 2: + rhs_bdims = [ + dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] + ] + rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] + rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) + if rhs_batch_size > 1: + assert ( + lhs_batch_size == rhs_batch_size + ), ( + f"Leading dimensins of RHS ({rhs_batch_shape=}) is not broadcast-compatible " + + f"with the leading dimensions of LHS ({lhs_batch_shape=})." 
+ ) + # Infer output shape + out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) # Validate bias/bias_grad shape against inferred output bias_dtype = jnp.bfloat16 if jax_dtype_is_fp8(out_dtype) else out_dtype if fuse_bias: @@ -227,11 +235,8 @@ def lowering( Fused attention fwd lowering rules """ lhs_aval, _, rhs_aval, _, bias_aval, *_ = ctx.avals_in - lhs_inner_dim, rhs_inner_dim = map( - lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, - contracting_dims, - (lhs_aval.ndim, rhs_aval.ndim), - ) + lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, + (lhs_aval.ndim, rhs_aval.ndim)) lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 @@ -376,19 +381,8 @@ def batcher( check_valid_batch_dims(batch_dims) lhs_bdims, *_, bias_bdims, gelu_input_bdims, out_amax_bdims, out_scale_bdims = batch_dims - # FP8 GEMM only supports non-transposed LHS and transposed RHS - lhs, _, rhs, *_ = batched_args - lhs_trans = contracting_dims[0] != lhs.ndim - 1 - rhs_trans = contracting_dims[1] == rhs.ndim - 1 - lhs = jnp.matrix_transpose(lhs) if lhs_trans and jax_dtype_is_fp8(lhs.dtype) else lhs - rhs = jnp.matrix_transpose(rhs) if not rhs_trans and jax_dtype_is_fp8(rhs.dtype) else rhs - contracting_dims = (1, 1) - return CollectiveGemmPrimitive.outer_primitive.bind( - lhs, - batched_args[1], - rhs, - *batched_args[3:], + *batched_args, out_dtype=out_dtype, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, @@ -415,11 +409,7 @@ def infer_sharding_from_operands( lhs, _, rhs, *_ = arg_infos lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) - lhs_inner_dim, rhs_inner_dim = map( - lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, - contracting_dims, - (lhs.ndim, rhs.ndim), - ) + lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) if lhs_spec[lhs_inner_dim] != rhs_spec[rhs_inner_dim] and not grad: warnings.warn( "Forcing the inner dimension of LHS to match the sharding of inner " @@ -471,11 +461,7 @@ def partition( lhs, _, rhs, *_ = arg_infos lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) - lhs_inner_dim, rhs_inner_dim = map( - lambda inner_dim, ndims: (ndims - inner_dim) if inner_dim < 0 else inner_dim, - contracting_dims, - (lhs.ndim, rhs.ndim), - ) + lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) lhs_trans = lhs_inner_dim != lhs.ndim - 1 rhs_trans = rhs_inner_dim == rhs.ndim - 1 @@ -578,14 +564,13 @@ def sharded_impl( def fp8_gemm_impl( lhs: ArrayLike, lhs_scale_inv: ArrayLike, - rhs: ArrayLike, + rhs_t: ArrayLike, rhs_scale_inv: ArrayLike, bias: Optional[ArrayLike] = None, gelu_input: Optional[ArrayLike] = None, out_amax: Optional[ArrayLike] = None, out_scale: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, - contracting_dims: Tuple[int, int] = (1, 1), fuse_gelu: bool = False, fuse_bias: bool = False, accumulate: bool = False, @@ -606,22 +591,20 @@ def fp8_gemm_impl( if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=bias.dtype) elif gelu_input is None: - lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 - rhs_outer_dim = rhs.ndim - 2 if contracting_dims[1] == 0 else rhs.ndim - 1 - out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + out_shape = (*lhs.shape[:-2], lhs.shape[-2], rhs_t.shape[-2]) gelu_input = jnp.zeros(out_shape, dtype=bias.dtype) out, out_amax, out_scale, pre_gelu_out, _ = 
CollectiveGemmPrimitive.outer_primitive.bind( - rhs, - rhs_scale_inv, lhs, lhs_scale_inv, + rhs_t, + rhs_scale_inv, bias, gelu_input, out_amax, out_scale, out_dtype=out_dtype, - contracting_dims=tuple(reversed(contracting_dims)), + contracting_dims=(-1, -1), fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, grad=False, @@ -645,10 +628,9 @@ def gemm_impl( use_split_accumulator: bool = False, ) -> Tuple[ArrayLike, ...]: """Non-FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" - dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) - - lhs_outer_dim = lhs.ndim - 1 if contracting_dims[0] == 1 else lhs.ndim - 2 - rhs_outer_dim = rhs.ndim - 2 if contracting_dims[1] == 0 else rhs.ndim - 1 + lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) + lhs_outer_dim = lhs.ndim - 1 if lhs_inner_dim == lhs.ndim - 2 else lhs.ndim - 2 + rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) if not fuse_bias: @@ -667,6 +649,7 @@ def gemm_impl( elif gelu_input is None: gelu_input = jnp.zeros(out_shape, dtype=lhs.dtypes) + dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) out, _, _, pre_gelu_out, bias_grad = CollectiveGemmPrimitive.outer_primitive.bind( lhs, dummy_fp8_meta, diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 79499725b7..e9e046d182 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -1,7 +1,8 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. -from functools import partial +import operator +from functools import partial, reduce from typing import Optional, Tuple, Union import jax @@ -19,6 +20,7 @@ dbias_cast_transpose, dact_lu_dbias_cast_transpose, ) +from .cpp_extensions.gemm import sanitize_dims __all__ = [ @@ -98,27 +100,48 @@ def _gemm_bwd_rule( grad, ): x, kernel, pre_gelu_out, fuse_bias = ctx + x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) - x_t_contracting = 0 if contracting_dims[0] == 1 else 1 - wgrad, dgelu, bgrad = gemm_impl( - x, + + kernel_t_contracting = ( + kernel.ndim - 2 if kernel_inner_dim == kernel.ndim - 1 else kernel.ndim - 1 + ) + # DGRAD: ([B], M, N) x (K, N)^T = ([B], M, K) + dgrad, dgelu, _ = gemm_impl( grad, + kernel, gelu_input=pre_gelu_out, - contracting_dims=(x_t_contracting, 0), + contracting_dims=(-1, kernel_t_contracting), fuse_gelu=fuse_gelu, - fuse_bias=fuse_bias, + fuse_bias=False, grad=True, accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) - kernel_t_contracting = 1 if contracting_dims[1] == 0 else 0 - dgrad, *_ = gemm_impl( - dgelu if fuse_gelu else grad, - kernel, + # Collapse batch x sequence dimensions for WGRAD + x_outer_dim = x.ndim - 2 if x_inner_dim == x.ndim - 1 else x.ndim - 1 + wgrad_rhs = dgelu if fuse_gelu else grad + if x.ndim > 2: + batch_size = reduce(operator.mul, x.shape[:-2], 1) + x = jax.lax.reshape( + jax.lax.transpose(x, (*list(range(x.ndim - 2)), x_outer_dim, x_inner_dim)), + (batch_size * x.shape[x_outer_dim], x.shape[x_inner_dim]), + ) + wgrad_rhs = jnp.reshape( + wgrad_rhs, shape=(batch_size * wgrad_rhs.shape[-2], wgrad_rhs.shape[-1]) + ) + x_t_contracting = 0 + else: + x_t_contracting = x_outer_dim + + # WGRAD: ([B], M, K)^T x ([B], M, N) = ([B], K, N) + wgrad, _, bgrad = gemm_impl( + x, + wgrad_rhs, gelu_input=pre_gelu_out, - contracting_dims=(1, kernel_t_contracting), - fuse_gelu=fuse_gelu, + 
contracting_dims=(x_t_contracting, wgrad_rhs.ndim - 2), + fuse_gelu=False, fuse_bias=fuse_bias, grad=True, accumulate=accumulate, @@ -140,7 +163,6 @@ def fp8_gemm( fp8_meta: FP8MetaPackage, bias: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, - contracting_dims: Tuple[int, int] = (1, 1), fuse_gelu: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, @@ -152,7 +174,6 @@ def fp8_gemm( fp8_meta.amax_list, fp8_meta.scale_list, out_dtype, - contracting_dims, fuse_gelu, accumulate, use_split_accumulator, @@ -162,12 +183,11 @@ def fp8_gemm( @partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9)) def _fp8_gemm( x: ArrayLike, - kernel: ArrayLike, + kernel_t: ArrayLike, bias: ArrayLike, amax_list: ArrayLike, scale_list: ArrayLike, out_dtype: jnp.dtype, - contracting_dims: Tuple[int, int], fuse_gelu: bool, accumulate: bool, use_split_accumulator: bool, @@ -175,12 +195,11 @@ def _fp8_gemm( """Non-FP8 `nvte_cublas_gemm()` with optional GELU and bias-add fusions.""" out, _ = _fp8_gemm_fwd_rule( x, - kernel, + kernel_t, bias, amax_list, scale_list, out_dtype, - contracting_dims, fuse_gelu, accumulate, use_split_accumulator, @@ -190,12 +209,11 @@ def _fp8_gemm( def _fp8_gemm_fwd_rule( x: ArrayLike, - kernel: ArrayLike, + kernel_t: ArrayLike, bias: ArrayLike, amax_list: ArrayLike, scale_list: ArrayLike, out_dtype: jnp.dtype, - contracting_dims: Tuple[int, int], fuse_gelu: bool, accumulate: bool, use_split_accumulator: bool, @@ -221,54 +239,36 @@ def _fp8_gemm_fwd_rule( x_scale = scale_list[FP8MetaPackage.INPUT_IDX] x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] if x.dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: - if contracting_dims[0] == 0: - _, casted_x, updated_x_amax = cast_transpose( - x, - x_amax, - x_scale, - x_scale_inv, - fwd_dtype, - static_axis_boundary=-1, - transpose_axis_boundary=-1, - ) - else: - casted_x, updated_x_amax = cast_fp8(x, x_amax, x_scale, x_scale_inv, fwd_dtype) + casted_x, casted_x_t, updated_x_amax = cast_transpose( + x, + x_amax, + x_scale, + x_scale_inv, + fwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) else: - if contracting_dims[0] == 0: - casted_x_t = x - casted_x = casted_x_t.transpose() - else: - casted_x = x + casted_x = x + casted_x_t = jnp.matrix_transpose(x) updated_x_amax = x_amax kernel_amax = amax_list[FP8MetaPackage.WEIGHT_IDX][0:1] kernel_scale = scale_list[FP8MetaPackage.WEIGHT_IDX] kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] - if kernel.dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: - if contracting_dims[1] == 0: # need to transpose the kernel for FP8 GEMM - _, casted_kernel_t, updated_kernel_amax = cast_transpose( - kernel, - kernel_amax, - kernel_scale, - kernel_scale_inv, - fwd_dtype, - static_axis_boundary=-1, - transpose_axis_boundary=-1, - ) - else: - casted_kernel_t, updated_kernel_amax = cast_fp8( - kernel, - kernel_amax, - kernel_scale, - kernel_scale_inv, - fwd_dtype, - ) + if kernel_t.dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + casted_kernel_t, casted_kernel, updated_kernel_amax = cast_transpose( + kernel_t, + kernel_amax, + kernel_scale, + kernel_scale_inv, + fwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) else: - if contracting_dims[1] == 0: - casted_kernel = kernel - casted_kernel_t = casted_kernel.transpose() - else: - casted_kernel_t = kernel + casted_kernel = jnp.matrix_transpose(kernel_t) + casted_kernel_t = kernel_t updated_kernel_amax = kernel_amax out_amax = ( @@ -300,24 +300,24 @@ def _fp8_gemm_fwd_rule( 
updated_out_scale = None ctx = ( - casted_x, - casted_kernel_t, + casted_x_t, + casted_kernel, amax_list, scale_list, scale_inv_list, updated_x_amax, updated_kernel_amax, + updated_out_amax, pre_gelu_out if fuse_gelu else None, fuse_bias, maybe_fp32_to_fm32, ) - return (out, updated_out_amax, updated_out_scale), ctx + return (out, updated_out_scale), ctx def _fp8_gemm_bwd_rule( out_dtype, - contracting_dims, fuse_gelu, accumulate, use_split_accumulator, @@ -325,83 +325,84 @@ def _fp8_gemm_bwd_rule( grad, ): ( - casted_x, - casted_kernel_t, + casted_x_t, + casted_kernel, amax_list, scale_list, scale_inv_list, updated_x_amax, updated_kernel_amax, + updated_out_amax, pre_gelu_out, fuse_bias, maybe_fp32_to_fm32, ) = ctx - fwd_dtype = FP8Helper.FWD_DTYPE bwd_dtype = FP8Helper.BWD_DTYPE grad_amax = amax_list[FP8MetaPackage.GRAD_IDX][0:1] grad_scale = scale_list[FP8MetaPackage.GRAD_IDX] grad_scale_inv = scale_inv_list[FP8MetaPackage.GRAD_ID] - if fuse_bias and not fuse_gelu: - # Since there is no GELU fusion, we need to fuse dbias into this cast_transpose. - _, casted_grad_t, bgrad, updated_grad_amax = dbias_cast_transpose( - grad, - grad_amax, - grad_scale, - grad_scale_inv, - bwd_dtype, - static_axis_boundary=-1, - transpose_axis_boundary=-1, - ) + if fuse_gelu: + if fuse_bias: + # Fuse dbias into this dGELU. + casted_grad, casted_grad_t, bgrad, updated_grad_amax = dact_lu_dbias_cast_transpose( + grad, + pre_gelu_out, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + activation_type=("gelu",), + ) + else: + # No bias to fuse so we just do dGELU. + casted_grad, casted_grad_t, updated_grad_amax = dact_lu(grad, pre_gelu_out, ("gelu",)) + bgrad = None else: - # If both bias and GELU is fused into the forward pass, we will fuse dbias later with - # dGELU. No need to do it here. - _, casted_grad_t, updated_grad_amax = cast_transpose( - grad, - grad_amax, - grad_scale, - grad_scale_inv, - bwd_dtype, - static_axis_boundary=-1, - transpose_axis_boundary=-1, - ) - bgrad = None + if fuse_bias: + # Since there is no GELU fusion, we need to fuse dbias into this cast_transpose. + casted_grad, casted_grad_t, bgrad, updated_grad_amax = dbias_cast_transpose( + grad, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) + else: + # If both bias and GELU is fused into the forward pass, we will fuse dbias later with + # dGELU. No need to do it here. + casted_grad, casted_grad_t, updated_grad_amax = cast_transpose( + grad, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + ) + bgrad = None - x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] - wgrad, *_ = fp8_gemm_impl( - casted_x, - x_scale_inv, - casted_grad_t, + kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] + dgrad, *_ = fp8_gemm_impl( + casted_grad, grad_scale_inv, + casted_kernel, + kernel_scale_inv, accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) - if fuse_gelu and fuse_bias: - # Fuse dbias into this dGELU. - casted_dgelu, casted_dgelu_t, bgrad, updated_dgelu_amax = dact_lu_dbias_cast_transpose( - grad, - pre_gelu_out, - grad_amax, - grad_scale, - grad_scale_inv, - bwd_dtype, - static_axis_boundary=-1, - transpose_axis_boundary=-1, - activation_type=("gelu",), - ) - elif fuse_gelu: - # No bias to fuse so we just do dGELU. 
- casted_dgelu, casted_dgelu_t, updated_dgelu_amax = dact_lu(grad, pre_gelu_out, ("gelu",)) - bgrad = None - - kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] - dgrad, *_ = gemm_impl( - casted_dgelu if fuse_gelu else grad, + x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] + wgrad, *_ = fp8_gemm_impl( + casted_x_t, + x_scale_inv, + casted_grad_t, grad_scale_inv, - casted_kernel_t, - kernel_scale_inv, accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) @@ -412,6 +413,13 @@ def _fp8_gemm_bwd_rule( amax_list[FP8MetaPackage.WEIGHT_IDX] = ( amax_list[FP8MetaPackage.WEIGHT_IDX].at[0].set(updated_kernel_amax[0]) ) + amax_list[FP8MetaPackage.GRAD_IDX] = ( + amax_list[FP8MetaPackage.GRAD_IDX].at[0].set(updated_grad_amax[0]) + ) + if out_dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + amax_list[FP8MetaPackage.OUTPUT_IDX] = ( + amax_list[FP8MetaPackage.OUTPUT_IDX].at[0].set(updated_out_amax[0]) + ) amax_list = maybe_fp32_to_fm32(*amax_list) scale_list = maybe_fp32_to_fm32(*scale_list) @@ -433,20 +441,24 @@ def type_safe_gemm( accumulate: bool = False, use_split_accumulator: bool = False, ) -> ArrayLike: - if x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] or kernel.dtype in [ - jnp.float8_e4m3fn, - jnp.float8_e5m2, - ]: + if (x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] + or kernel.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2]): assert fp8_meta is not None, "GEMM operands have FP8 dtypes but FP8MetaPackage is None." if fp8_meta is not None: + x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) + assert ( + x_inner_dim == x.ndim - 1 and kernel_inner_dim == kernel.ndim - 2 + ), ( + "FP8 GEMM requires non-transposed X (LHS) and transposed kernel (RHS), " + + "i.e. contracting_dims=(-1, -1)." + ) return fp8_gemm( x, kernel, bias, fp8_meta, out_dtype, - contracting_dims, fuse_gelu, accumulate, use_split_accumulator, From 448eaa99a3c3c93d8bcf2cb2d8ca6273f4f950d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Nov 2024 23:57:09 +0000 Subject: [PATCH 05/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/cpp_extensions/gemm.py | 14 +++++++------- transformer_engine/jax/gemm.py | 11 +++++------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index ee4c38d076..b935a5c2f7 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -101,8 +101,9 @@ def abstract( ), "Missing RHS operand scale inverse in FP8 GEMM." # Validate operand layouts - lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, - (lhs_aval.ndim, rhs_aval.ndim)) + lhs_inner_dim, rhs_inner_dim = map( + sanitize_dims, contracting_dims, (lhs_aval.ndim, rhs_aval.ndim) + ) assert ( lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim] ), f"Incompatible operand sizes: {lhs_aval.shape} x {rhs_aval.shape}." @@ -150,9 +151,7 @@ def abstract( rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) if rhs_batch_size > 1: - assert ( - lhs_batch_size == rhs_batch_size - ), ( + assert lhs_batch_size == rhs_batch_size, ( f"Leading dimensins of RHS ({rhs_batch_shape=}) is not broadcast-compatible " + f"with the leading dimensions of LHS ({lhs_batch_shape=})." 
) @@ -235,8 +234,9 @@ def lowering( Fused attention fwd lowering rules """ lhs_aval, _, rhs_aval, _, bias_aval, *_ = ctx.avals_in - lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, - (lhs_aval.ndim, rhs_aval.ndim)) + lhs_inner_dim, rhs_inner_dim = map( + sanitize_dims, contracting_dims, (lhs_aval.ndim, rhs_aval.ndim) + ) lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index e9e046d182..3cab17b10b 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -102,7 +102,6 @@ def _gemm_bwd_rule( x, kernel, pre_gelu_out, fuse_bias = ctx x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) - kernel_t_contracting = ( kernel.ndim - 2 if kernel_inner_dim == kernel.ndim - 1 else kernel.ndim - 1 ) @@ -441,15 +440,15 @@ def type_safe_gemm( accumulate: bool = False, use_split_accumulator: bool = False, ) -> ArrayLike: - if (x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] - or kernel.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2]): + if x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] or kernel.dtype in [ + jnp.float8_e4m3fn, + jnp.float8_e5m2, + ]: assert fp8_meta is not None, "GEMM operands have FP8 dtypes but FP8MetaPackage is None." if fp8_meta is not None: x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) - assert ( - x_inner_dim == x.ndim - 1 and kernel_inner_dim == kernel.ndim - 2 - ), ( + assert x_inner_dim == x.ndim - 1 and kernel_inner_dim == kernel.ndim - 2, ( "FP8 GEMM requires non-transposed X (LHS) and transposed kernel (RHS), " + "i.e. contracting_dims=(-1, -1)." ) From cb6ae3cf7570285a13aae30b414a3a7ec19b4f6c Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Mon, 18 Nov 2024 22:31:35 +0000 Subject: [PATCH 06/32] batched operands for GEMM custom op seem to be working now Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 151 +++++++++++++----- transformer_engine/jax/gemm.py | 26 +-- 2 files changed, 119 insertions(+), 58 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index b935a5c2f7..cf029d16db 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -136,8 +136,11 @@ def abstract( out_scale_updated_dtype = jnp.float32 # Make sure leading dimensions of RHS is broadcast-compatible with LHS - lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 - rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 + lhs_outer_dim, rhs_outer_dim = map( + lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + (lhs_inner_dim, rhs_inner_dim), + (lhs_aval.ndim, rhs_aval.ndim) + ) lhs_bdims = [ dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] @@ -152,12 +155,17 @@ def abstract( rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) if rhs_batch_size > 1: assert lhs_batch_size == rhs_batch_size, ( - f"Leading dimensins of RHS ({rhs_batch_shape=}) is not broadcast-compatible " - + f"with the leading dimensions of LHS ({lhs_batch_shape=})." + f"Leading dimensins of RHS ({rhs_aval.shape=}) is not broadcast-compatible " + + f"with the leading dimensions of LHS ({lhs_aval.shape=})." 
) - # Infer output shape + # Infer output shape: out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) + if lhs_aval.ndim > 2 and rhs_aval.ndim > 2 and lhs_batch_size > 1: + # When both RHS and LHS are batched, the batch dimensions are collapsed into the + # contracting dimension. + out_shape = (lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) + # Validate bias/bias_grad shape against inferred output bias_dtype = jnp.bfloat16 if jax_dtype_is_fp8(out_dtype) else out_dtype if fuse_bias: @@ -169,9 +177,16 @@ def abstract( assert bias_aval.size == 0, "Internal TE error." # Validate GELU input/output + gelu_shape = (0, ) if fuse_gelu: - assert all( - [gelu_input_aval.shape[i] == out_shape[i] for i in len(out_shape)] + gelu_shape = ( + (reduce(operator.mul, out_shape[:-1], 1), out_shape[-1]) + if len(out_shape) > 2 + else out_shape + ) + assert ( + gelu_input_aval.ndim == 2 + and all([gelu_input_aval.shape[i] == gelu_shape[i] for i in len(gelu_shape)]) ), "Invalid GELU input shape." assert gelu_input_aval.dtype == bias_dtype, "Invalid GELU dtype." else: @@ -185,7 +200,7 @@ def abstract( out_scale_updated_aval = out_scale_aval.update( shape=out_scale_aval.shape, dtype=out_scale_updated_dtype ) - pre_gelu_out_aval = gelu_input_aval.update(shape=gelu_input_aval.shape, dtype=bias_dtype) + pre_gelu_out_aval = gelu_input_aval.update(shape=gelu_shape, dtype=bias_dtype) bias_grad_aval = bias_aval.update(shape=bias_aval.shape, dtype=bias_dtype) workspace_aval = jax.core.ShapedArray( shape=(get_cublas_workspace_size_bytes(),), dtype=jnp.uint8 @@ -285,8 +300,11 @@ def lowering( ] args = CustomCallArgsWrapper(out_types, operands, operand_shapes) - lhs_outer_dim = lhs_aval.ndim - 1 if lhs_trans else lhs_aval.ndim - 2 - rhs_outer_dim = rhs_aval.ndim - 2 if rhs_trans else rhs_aval.ndim - 1 + lhs_outer_dim, rhs_outer_dim = map( + lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + (lhs_inner_dim, rhs_inner_dim), + (lhs_aval.ndim, rhs_aval.ndim) + ) m = lhs_aval.shape[lhs_outer_dim] k = rhs_aval.shape[rhs_inner_dim] n = rhs_aval.shape[rhs_outer_dim] @@ -338,6 +356,43 @@ def impl( ): assert CollectiveGemmPrimitive.inner_primitive is not None + lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) + lhs_trans = lhs_inner_dim != lhs.ndim - 1 + rhs_trans = rhs_inner_dim == rhs.ndim - 1 + + # Squeeze batch dimensions of size 1 without any modification. + squeeze_dims = [] + expand_out = False + if lhs.ndim > 2: + squeeze_dims = [dim for dim in range(lhs.ndim - 2) if lhs.shape[dim] == 1] + if len(squeeze_dims) > 0: + expand_out = True + lhs = jax.lax.squeeze(lhs, squeeze_dims) + contracting_dims = (lhs.ndim - 2 if lhs_trans else lhs.ndim - 1, + contracting_dims[1]) + if rhs.ndim > 2: + rhs_squeeze_dims = [dim for dim in range(rhs.ndim - 2) if rhs.shape[dim] == 1] + if len(squeeze_dims) > 0: + rhs = jax.lax.squeeze(rhs, rhs_squeeze_dims) + contracting_dims = (contracting_dims[0], + rhs.ndim - 1 if rhs_trans else rhs.ndim - 2) + + # Collapse batch dimensions that are larger thanm size 1. 
+ # FWD: (B, M, K) x (K, N) = (B*M, K) x (K, N) = (B*M, N) + # DGRAD: (B, M, N) x (K, N)^T = (B*M, N) x (N, K) = (B*M, K) + # WGRAD: (B, M, K)^T x (B, M, N) = (K, B*M) x (B*M, N) = (K, N) + batch_shape = [lhs.shape[dim] for dim in range(lhs.ndim - 2)] + batch_size = reduce(operator.mul, batch_shape, 1) + reshape_output = not (lhs.ndim > 2 and rhs.ndim > 2) + if lhs.ndim > 2: + lhs_2d_shape = (batch_size * lhs.shape[-2], lhs.shape[-1]) + lhs = jax.lax.reshape(lhs, lhs_2d_shape) + contracting_dims = (0 if lhs_trans else 1, contracting_dims[1]) + if rhs.ndim > 2: + rhs_2d_shape = (reduce(operator.mul, rhs.shape[:-1], 1), rhs.shape[-1]) + rhs = jax.lax.reshape(rhs, rhs_2d_shape) + contracting_dims = (contracting_dims[0], 1 if rhs_trans else 0) + ( out, out_amax_updated, @@ -362,6 +417,15 @@ def impl( accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) + + # Recover batched dimensions in the output + if reshape_output: + out_batched_shape = (*batch_shape, int(out.shape[-2] / batch_size), out.shape[-1]) + out = jax.lax.reshape(out, out_batched_shape) + + if expand_out: + out = jax.lax.expand_dims(out, squeeze_dims) + return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad @staticmethod @@ -381,16 +445,19 @@ def batcher( check_valid_batch_dims(batch_dims) lhs_bdims, *_, bias_bdims, gelu_input_bdims, out_amax_bdims, out_scale_bdims = batch_dims - return CollectiveGemmPrimitive.outer_primitive.bind( - *batched_args, - out_dtype=out_dtype, - contracting_dims=contracting_dims, - fuse_gelu=fuse_gelu, - fuse_bias=fuse_bias, - grad=grad, - accumulate=accumulate, - use_split_accumulator=use_split_accumulator, - )(lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims) + return ( + CollectiveGemmPrimitive.outer_primitive.bind( + *batched_args, + out_dtype=out_dtype, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=grad, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + ), + (lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims) + ) @staticmethod def infer_sharding_from_operands( @@ -417,10 +484,12 @@ def infer_sharding_from_operands( + "not already partitioned correctly." 
) - lhs_trans = lhs_inner_dim != lhs.ndim - 1 - rhs_trans = rhs_inner_dim == rhs.ndim - 1 - lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 - rhs_outer_dim = rhs.ndim - 2 if rhs_trans else rhs.ndim - 1 + lhs_outer_dim, rhs_outer_dim = map( + lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + (lhs_inner_dim, rhs_inner_dim), + (lhs.ndim, rhs.ndim) + ) + rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] rhs_outer_spec = rhs_spec[rhs_outer_dim] @@ -430,18 +499,20 @@ def infer_sharding_from_operands( # Outer (sequence) dimension of the GEMM output is always unsharded out_spec = [*batch_specs, None, rhs_outer_spec] + batch_size = reduce(operator.mul, lhs.shape[:-2], 1) + if lhs.ndim > 2 and rhs.ndim > 2 and batch_size > 1: + out_spec = [None, rhs_outer_spec] out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) # FP8 metas are always unsharded fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) - # Pre-GELU output matches output spec if GELU fusion is turned on, otherwise unsharded - gelu_spec = out_spec if fuse_gelu else [None] + # Pre-GELU output matches output, if GELU fusion is turned on, otherwise unsharded + gelu_spec = [None, rhs_outer_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) # Bias gradient spec matches outer dimension of output if bias fusion is turned on bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) - return (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, bias_sharding) @staticmethod @@ -462,11 +533,11 @@ def partition( lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) - - lhs_trans = lhs_inner_dim != lhs.ndim - 1 - rhs_trans = rhs_inner_dim == rhs.ndim - 1 - lhs_outer_dim = lhs.ndim - 1 if lhs_trans else lhs.ndim - 2 - rhs_outer_dim = rhs.ndim - 2 if rhs_trans else rhs.ndim - 1 + lhs_outer_dim, rhs_outer_dim = map( + lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + (lhs_inner_dim, rhs_inner_dim), + (lhs.ndim, rhs.ndim) + ) lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] rhs_outer_spec = rhs_spec[rhs_outer_dim] @@ -488,10 +559,13 @@ def partition( # Outer (sequence) dimension of the GEMM output is always unsharded out_spec = [*batch_specs, None, rhs_outer_spec] + batch_size = reduce(operator.mul, lhs.shape[:-2], 1) + if lhs.ndim > 2 and rhs.ndim > 2 and batch_size > 1: + out_spec = [None, rhs_outer_spec] out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) # Pre-GELU output matches output spec if GELU fusion is turned on, otherwise unsharded - gelu_spec = out_spec if fuse_gelu else [None] + gelu_spec = [None, rhs_outer_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) arg_shardings = ( @@ -547,10 +621,10 @@ def sharded_impl( # GEMM output needs to be all-reduced when the contracting dimension is sharded. # If the layer is sequence-parallel, we also need to scatter the output, which we # can combine into a reduce-scatter here. 
- out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().cp_resource, mesh) + out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().tp_resource, mesh) if fuse_gelu: pre_gelu_out = lax_paral_op( - pre_gelu_out, jax.lax.psum, global_mesh_resource().cp_resource, mesh + pre_gelu_out, jax.lax.psum, global_mesh_resource().tp_resource, mesh ) return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad @@ -629,8 +703,11 @@ def gemm_impl( ) -> Tuple[ArrayLike, ...]: """Non-FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) - lhs_outer_dim = lhs.ndim - 1 if lhs_inner_dim == lhs.ndim - 2 else lhs.ndim - 2 - rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 + lhs_outer_dim, rhs_outer_dim = map( + lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + (lhs_inner_dim, rhs_inner_dim), + (lhs.ndim, rhs.ndim) + ) out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) if not fuse_bias: diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 3cab17b10b..01ee60f24b 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -101,16 +101,15 @@ def _gemm_bwd_rule( ): x, kernel, pre_gelu_out, fuse_bias = ctx x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) + x_outer_dim = x.ndim - 1 if x_inner_dim != x.ndim - 1 else x.ndim - 2 + kernel_outer_dim = kernel.ndim - 2 if kernel_inner_dim == kernel.ndim - 1 else kernel.ndim - 1 - kernel_t_contracting = ( - kernel.ndim - 2 if kernel_inner_dim == kernel.ndim - 1 else kernel.ndim - 1 - ) # DGRAD: ([B], M, N) x (K, N)^T = ([B], M, K) dgrad, dgelu, _ = gemm_impl( grad, kernel, gelu_input=pre_gelu_out, - contracting_dims=(-1, kernel_t_contracting), + contracting_dims=(-1, kernel_outer_dim), fuse_gelu=fuse_gelu, fuse_bias=False, grad=True, @@ -118,28 +117,13 @@ def _gemm_bwd_rule( use_split_accumulator=use_split_accumulator, ) - # Collapse batch x sequence dimensions for WGRAD - x_outer_dim = x.ndim - 2 if x_inner_dim == x.ndim - 1 else x.ndim - 1 + # WGRAD: ([B], M, K)^T x ([B], M, N) = (K, N) wgrad_rhs = dgelu if fuse_gelu else grad - if x.ndim > 2: - batch_size = reduce(operator.mul, x.shape[:-2], 1) - x = jax.lax.reshape( - jax.lax.transpose(x, (*list(range(x.ndim - 2)), x_outer_dim, x_inner_dim)), - (batch_size * x.shape[x_outer_dim], x.shape[x_inner_dim]), - ) - wgrad_rhs = jnp.reshape( - wgrad_rhs, shape=(batch_size * wgrad_rhs.shape[-2], wgrad_rhs.shape[-1]) - ) - x_t_contracting = 0 - else: - x_t_contracting = x_outer_dim - - # WGRAD: ([B], M, K)^T x ([B], M, N) = ([B], K, N) wgrad, _, bgrad = gemm_impl( x, wgrad_rhs, gelu_input=pre_gelu_out, - contracting_dims=(x_t_contracting, wgrad_rhs.ndim - 2), + contracting_dims=(x_outer_dim, wgrad_rhs.ndim - 2), fuse_gelu=False, fuse_bias=fuse_bias, grad=True, From 6f673559d250c9cf9c2713201da256b641cad279 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 22:32:02 +0000 Subject: [PATCH 07/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/cpp_extensions/gemm.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index cf029d16db..0948139dc9 100644 --- 
a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -139,7 +139,7 @@ def abstract( lhs_outer_dim, rhs_outer_dim = map( lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, (lhs_inner_dim, rhs_inner_dim), - (lhs_aval.ndim, rhs_aval.ndim) + (lhs_aval.ndim, rhs_aval.ndim), ) lhs_bdims = [ @@ -177,16 +177,15 @@ def abstract( assert bias_aval.size == 0, "Internal TE error." # Validate GELU input/output - gelu_shape = (0, ) + gelu_shape = (0,) if fuse_gelu: gelu_shape = ( (reduce(operator.mul, out_shape[:-1], 1), out_shape[-1]) if len(out_shape) > 2 else out_shape ) - assert ( - gelu_input_aval.ndim == 2 - and all([gelu_input_aval.shape[i] == gelu_shape[i] for i in len(gelu_shape)]) + assert gelu_input_aval.ndim == 2 and all( + [gelu_input_aval.shape[i] == gelu_shape[i] for i in len(gelu_shape)] ), "Invalid GELU input shape." assert gelu_input_aval.dtype == bias_dtype, "Invalid GELU dtype." else: @@ -303,7 +302,7 @@ def lowering( lhs_outer_dim, rhs_outer_dim = map( lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, (lhs_inner_dim, rhs_inner_dim), - (lhs_aval.ndim, rhs_aval.ndim) + (lhs_aval.ndim, rhs_aval.ndim), ) m = lhs_aval.shape[lhs_outer_dim] k = rhs_aval.shape[rhs_inner_dim] @@ -368,14 +367,18 @@ def impl( if len(squeeze_dims) > 0: expand_out = True lhs = jax.lax.squeeze(lhs, squeeze_dims) - contracting_dims = (lhs.ndim - 2 if lhs_trans else lhs.ndim - 1, - contracting_dims[1]) + contracting_dims = ( + lhs.ndim - 2 if lhs_trans else lhs.ndim - 1, + contracting_dims[1], + ) if rhs.ndim > 2: rhs_squeeze_dims = [dim for dim in range(rhs.ndim - 2) if rhs.shape[dim] == 1] if len(squeeze_dims) > 0: rhs = jax.lax.squeeze(rhs, rhs_squeeze_dims) - contracting_dims = (contracting_dims[0], - rhs.ndim - 1 if rhs_trans else rhs.ndim - 2) + contracting_dims = ( + contracting_dims[0], + rhs.ndim - 1 if rhs_trans else rhs.ndim - 2, + ) # Collapse batch dimensions that are larger thanm size 1. 
# FWD: (B, M, K) x (K, N) = (B*M, K) x (K, N) = (B*M, N) @@ -456,7 +459,7 @@ def batcher( accumulate=accumulate, use_split_accumulator=use_split_accumulator, ), - (lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims) + (lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims), ) @staticmethod @@ -487,7 +490,7 @@ def infer_sharding_from_operands( lhs_outer_dim, rhs_outer_dim = map( lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, (lhs_inner_dim, rhs_inner_dim), - (lhs.ndim, rhs.ndim) + (lhs.ndim, rhs.ndim), ) rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] @@ -536,7 +539,7 @@ def partition( lhs_outer_dim, rhs_outer_dim = map( lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, (lhs_inner_dim, rhs_inner_dim), - (lhs.ndim, rhs.ndim) + (lhs.ndim, rhs.ndim), ) lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] @@ -706,7 +709,7 @@ def gemm_impl( lhs_outer_dim, rhs_outer_dim = map( lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, (lhs_inner_dim, rhs_inner_dim), - (lhs.ndim, rhs.ndim) + (lhs.ndim, rhs.ndim), ) out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) From 4b2b2d44d735714ea9917fb00748c77e473fdafa Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 19 Nov 2024 17:57:33 +0000 Subject: [PATCH 08/32] fixed batch size 1 issue and enabled FSDP sharding for RHS operand Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 65 ++++++++----------- transformer_engine/jax/gemm.py | 18 +++-- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 0948139dc9..431dea6c1d 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -161,7 +161,7 @@ def abstract( # Infer output shape: out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) - if lhs_aval.ndim > 2 and rhs_aval.ndim > 2 and lhs_batch_size > 1: + if lhs_aval.ndim > 2 and rhs_aval.ndim > 2: # When both RHS and LHS are batched, the batch dimensions are collapsed into the # contracting dimension. out_shape = (lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) @@ -359,27 +359,6 @@ def impl( lhs_trans = lhs_inner_dim != lhs.ndim - 1 rhs_trans = rhs_inner_dim == rhs.ndim - 1 - # Squeeze batch dimensions of size 1 without any modification. - squeeze_dims = [] - expand_out = False - if lhs.ndim > 2: - squeeze_dims = [dim for dim in range(lhs.ndim - 2) if lhs.shape[dim] == 1] - if len(squeeze_dims) > 0: - expand_out = True - lhs = jax.lax.squeeze(lhs, squeeze_dims) - contracting_dims = ( - lhs.ndim - 2 if lhs_trans else lhs.ndim - 1, - contracting_dims[1], - ) - if rhs.ndim > 2: - rhs_squeeze_dims = [dim for dim in range(rhs.ndim - 2) if rhs.shape[dim] == 1] - if len(squeeze_dims) > 0: - rhs = jax.lax.squeeze(rhs, rhs_squeeze_dims) - contracting_dims = ( - contracting_dims[0], - rhs.ndim - 1 if rhs_trans else rhs.ndim - 2, - ) - # Collapse batch dimensions that are larger thanm size 1. 
# FWD: (B, M, K) x (K, N) = (B*M, K) x (K, N) = (B*M, N) # DGRAD: (B, M, N) x (K, N)^T = (B*M, N) x (N, K) = (B*M, K) @@ -426,9 +405,6 @@ def impl( out_batched_shape = (*batch_shape, int(out.shape[-2] / batch_size), out.shape[-1]) out = jax.lax.reshape(out, out_batched_shape) - if expand_out: - out = jax.lax.expand_dims(out, squeeze_dims) - return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad @staticmethod @@ -497,13 +473,9 @@ def infer_sharding_from_operands( batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] rhs_outer_spec = rhs_spec[rhs_outer_dim] - if rhs_spec[rhs_inner_dim] is not None and rhs_outer_spec is not None: - raise RuntimeError("Both inner and outer dimensions of RHS cannot be sharded.") - # Outer (sequence) dimension of the GEMM output is always unsharded out_spec = [*batch_specs, None, rhs_outer_spec] - batch_size = reduce(operator.mul, lhs.shape[:-2], 1) - if lhs.ndim > 2 and rhs.ndim > 2 and batch_size > 1: + if lhs.ndim > 2 and rhs.ndim > 2: out_spec = [None, rhs_outer_spec] out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) @@ -543,7 +515,6 @@ def partition( ) lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] - rhs_outer_spec = rhs_spec[rhs_outer_dim] # Force all-gather the outer (sequence) dimension of the LHS operand lhs_spec_new = [spec for spec in lhs_spec] @@ -551,8 +522,29 @@ def partition( lhs_spec_new[lhs_inner_dim] = rhs_spec[rhs_inner_dim] lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) + # If both dims of RHS is sharded (i.e. FSDP), determine if we do AG or AR based on LHS + # sharding. + rhs_spec_new = [spec for spec in rhs_spec] + if rhs_spec[rhs_inner_dim] is not None and rhs_spec[rhs_outer_dim] is not None: + if lhs_spec[lhs_inner_dim] is not None and lhs_spec[lhs_outer_dim] is not None: + # All dimensions of both LHS and RHS are sharded and the collective operation is + # ambiguous, we cannot infer sharding. + raise RuntimeError( + "Collective GEMM custom op cannot infer partitioning when both outer and " + + "contracting dimensions of both LHS and RHS operands are sharded." 
+ ) + elif lhs_spec[lhs_inner_dim] is not None: + # All-reduce after GEMM, so unshard the outer dimension of RHS + rhs_spec_new[rhs_outer_dim] = None + else: + # We either do all-gather before GEMM, or LHS is already unsharded, so unshard + # the inner dimension of RHS to match + rhs_spec_new[rhs_inner_dim] = None + + rhs_outer_spec = rhs_spec_new[rhs_outer_dim] + # RHS operand is unchanged, we already enforce that only one dimension can be sharded - rhs_sharding = NamedSharding(mesh, PartitionSpec(*rhs_spec)) + rhs_sharding = NamedSharding(mesh, PartitionSpec(*rhs_spec_new)) # Bias is sharded to match outer dimension spec of the RHS operand (also the output) bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) @@ -562,8 +554,7 @@ def partition( # Outer (sequence) dimension of the GEMM output is always unsharded out_spec = [*batch_specs, None, rhs_outer_spec] - batch_size = reduce(operator.mul, lhs.shape[:-2], 1) - if lhs.ndim > 2 and rhs.ndim > 2 and batch_size > 1: + if lhs.ndim > 2 and rhs.ndim > 2: out_spec = [None, rhs_outer_spec] out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) @@ -620,10 +611,8 @@ def sharded_impl( if jax_dtype_is_fp8(lhs.dtype): out_amax_updated = all_reduce_max_along_all_axes_except_PP(out_amax_updated, mesh) - if rhs_spec[rhs_inner_dim] is not None: - # GEMM output needs to be all-reduced when the contracting dimension is sharded. - # If the layer is sequence-parallel, we also need to scatter the output, which we - # can combine into a reduce-scatter here. + # GEMM output needs to be all-reduced when the contracting dimension is sharded. + if rhs_spec_new[rhs_inner_dim] is not None: out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().tp_resource, mesh) if fuse_gelu: pre_gelu_out = lax_paral_op( diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 01ee60f24b..3b562e4ffa 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -8,13 +8,11 @@ import jax import jax.numpy as jnp from jax.typing import ArrayLike -from jax.ad_checkpoint import checkpoint_name from .fp8 import FP8Helper, FP8MetaPackage from .cpp_extensions import ( gemm_impl, fp8_gemm_impl, - cast_fp8, cast_transpose, dact_lu, dbias_cast_transpose, @@ -68,6 +66,10 @@ def _gemm_fwd_rule( accumulate: bool, use_split_accumulator: bool, ) -> Tuple[ArrayLike, ...]: + assert kernel.ndim == 2, ( + "TE/JAX Collective GEMM custom op does not support batched RHS operand in forward mode." + ) + fuse_bias = bias is not None out, pre_gelu_out = gemm_impl( @@ -142,7 +144,7 @@ def _gemm_bwd_rule( def fp8_gemm( x: ArrayLike, - kernel: ArrayLike, + kernel_t: ArrayLike, fp8_meta: FP8MetaPackage, bias: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, @@ -150,9 +152,10 @@ def fp8_gemm( accumulate: bool = False, use_split_accumulator: bool = False, ) -> ArrayLike: + """Non-FP8 `nvte_cublas_gemm()` with optional GELU and bias-add fusions.""" return _fp8_gemm( x, - kernel, + kernel_t, bias, fp8_meta.amax_list, fp8_meta.scale_list, @@ -175,7 +178,6 @@ def _fp8_gemm( accumulate: bool, use_split_accumulator: bool, ) -> ArrayLike: - """Non-FP8 `nvte_cublas_gemm()` with optional GELU and bias-add fusions.""" out, _ = _fp8_gemm_fwd_rule( x, kernel_t, @@ -201,6 +203,10 @@ def _fp8_gemm_fwd_rule( accumulate: bool, use_split_accumulator: bool, ) -> Tuple[ArrayLike, ...]: + assert kernel_t.ndim == 2, ( + "TE/JAX Collective GEMM custom op does not support batched RHS operand in forward mode." 
+ ) + fuse_bias = bias is not None maybe_fm32_to_fp32, maybe_fp32_to_fm32 = FP8Helper.generate_fp8_meta_dtype_converter_pair( @@ -432,7 +438,7 @@ def type_safe_gemm( if fp8_meta is not None: x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) - assert x_inner_dim == x.ndim - 1 and kernel_inner_dim == kernel.ndim - 2, ( + assert x_inner_dim == x.ndim - 1 and kernel_inner_dim == kernel.ndim - 1, ( "FP8 GEMM requires non-transposed X (LHS) and transposed kernel (RHS), " + "i.e. contracting_dims=(-1, -1)." ) From 2b2753e2463ce788f5f7c582e898a304156b4f54 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:58:03 +0000 Subject: [PATCH 09/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/gemm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 3b562e4ffa..730d17846e 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -66,9 +66,9 @@ def _gemm_fwd_rule( accumulate: bool, use_split_accumulator: bool, ) -> Tuple[ArrayLike, ...]: - assert kernel.ndim == 2, ( - "TE/JAX Collective GEMM custom op does not support batched RHS operand in forward mode." - ) + assert ( + kernel.ndim == 2 + ), "TE/JAX Collective GEMM custom op does not support batched RHS operand in forward mode." fuse_bias = bias is not None @@ -203,9 +203,9 @@ def _fp8_gemm_fwd_rule( accumulate: bool, use_split_accumulator: bool, ) -> Tuple[ArrayLike, ...]: - assert kernel_t.ndim == 2, ( - "TE/JAX Collective GEMM custom op does not support batched RHS operand in forward mode." - ) + assert ( + kernel_t.ndim == 2 + ), "TE/JAX Collective GEMM custom op does not support batched RHS operand in forward mode." 
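# ---------------------------------------------------------------------------
# A minimal reference sketch of the layout these asserts enforce (an
# illustrative editor sketch, not part of the patch): the FP8 path expects a
# non-transposed LHS and a pre-transposed 2D kernel, so both operands
# contract over their last dimension, i.e. contracting_dims=(-1, -1).
import jax.numpy as jnp

def _reference_tn_gemm(x, kernel_t):
    # x: ([B], M, K), kernel_t: (N, K) -> out: ([B], M, N)
    return jnp.einsum("...mk,nk->...mn", x, kernel_t)

_x = jnp.ones((4, 128, 64), dtype=jnp.bfloat16)   # batched LHS activation
_k_t = jnp.ones((32, 64), dtype=jnp.bfloat16)     # 2D pre-transposed kernel
assert _reference_tn_gemm(_x, _k_t).shape == (4, 128, 32)
# ---------------------------------------------------------------------------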
fuse_bias = bias is not None From 969f597cb11fe9fd5b9780e57e818d402704fc0c Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 21 Nov 2024 09:28:44 +0000 Subject: [PATCH 10/32] fixed FSDP+TP w/ DP=1 and TP+DP, but FSDP+TP w/ DP>1 still crashes Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 283 +++++++++++------- transformer_engine/jax/gemm.py | 29 +- 2 files changed, 205 insertions(+), 107 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 431dea6c1d..bf80941f85 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -4,7 +4,8 @@ import warnings import operator from functools import reduce -from typing import Optional, Union, Tuple +from typing import Optional, Tuple +from collections.abc import Iterable import jax import jax.numpy as jnp @@ -42,6 +43,34 @@ def sanitize_dims(dim, ndims): return (ndims + dim) if dim < 0 else dim +def mirror_dim(dim, ndims): + return ndims - 2 if dim == ndims - 1 else ndims - 1 + + +def remove_fsdp_specs(pspecs): + fsdp_resource = global_mesh_resource().fsdp_resource + new_pspecs = [] + for spec in pspecs: + if spec is None: + new_pspecs.append(None) + elif fsdp_resource not in spec: + new_pspecs.append(spec) + elif isinstance(spec, Iterable) and not isinstance(spec, str): + new_spec = [] + for s in spec: + if s != fsdp_resource: + new_spec.append(s) + if len(new_spec) > 1: + new_pspecs.append(new_spec) + elif len(new_spec) == 1: + new_pspecs.append(new_spec[0]) + else: + new_pspecs.append(None) + else: + new_pspecs.append(None) + return new_pspecs + + def get_cublas_workspace_size_bytes() -> None: """Return 32 MiB if using hopper, 4 MiB for all other architectures.""" if tex.get_device_compute_capability() >= 90: @@ -55,7 +84,7 @@ class CollectiveGemmPrimitive(BasePrimitive): """ name = "te_gemm" - impl_static_args = (8, 9, 10, 11, 12, 13, 14) + impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15) multiple_results = True inner_primitive = None outer_primitive = None @@ -71,6 +100,7 @@ def abstract( out_amax_aval, out_scale_aval, out_dtype, + batched_output, contracting_dims, fuse_gelu, fuse_bias, @@ -137,33 +167,40 @@ def abstract( # Make sure leading dimensions of RHS is broadcast-compatible with LHS lhs_outer_dim, rhs_outer_dim = map( - lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + mirror_dim, (lhs_inner_dim, rhs_inner_dim), (lhs_aval.ndim, rhs_aval.ndim), ) - lhs_bdims = [ dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] ] lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) - if rhs_aval.ndim > 2: - rhs_bdims = [ - dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] - ] - rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] - rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) - if rhs_batch_size > 1: + + # Infer output shape + if batched_output: + assert lhs_aval.ndim > 2 and rhs_aval.ndim == 2, ( + "Batched output requires batched LHS and non-batched RHS operands." + ) + out_shape = ( + *lhs_batch_shape, + lhs_aval.shape[lhs_outer_dim], + rhs_aval.shape[rhs_outer_dim] + ) + else: + assert lhs_aval.ndim == rhs_aval.ndim, ( + "Non-batched output requires LHS and RHS operands with same number of dimensions." 
+ ) + if lhs_aval.ndim > 2: + rhs_bdims = [ + dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] + ] + rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] + rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) assert lhs_batch_size == rhs_batch_size, ( f"Leading dimensins of RHS ({rhs_aval.shape=}) is not broadcast-compatible " + f"with the leading dimensions of LHS ({lhs_aval.shape=})." ) - - # Infer output shape: - out_shape = (*lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) - if lhs_aval.ndim > 2 and rhs_aval.ndim > 2: - # When both RHS and LHS are batched, the batch dimensions are collapsed into the - # contracting dimension. out_shape = (lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) # Validate bias/bias_grad shape against inferred output @@ -237,6 +274,7 @@ def lowering( out_scale, *, out_dtype, + batched_output, contracting_dims, fuse_gelu, fuse_bias, @@ -247,6 +285,7 @@ def lowering( """ Fused attention fwd lowering rules """ + del batched_output lhs_aval, _, rhs_aval, _, bias_aval, *_ = ctx.avals_in lhs_inner_dim, rhs_inner_dim = map( sanitize_dims, contracting_dims, (lhs_aval.ndim, rhs_aval.ndim) @@ -300,9 +339,9 @@ def lowering( args = CustomCallArgsWrapper(out_types, operands, operand_shapes) lhs_outer_dim, rhs_outer_dim = map( - lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + mirror_dim, (lhs_inner_dim, rhs_inner_dim), - (lhs_aval.ndim, rhs_aval.ndim), + (lhs.ndim, rhs.ndim), ) m = lhs_aval.shape[lhs_outer_dim] k = rhs_aval.shape[rhs_inner_dim] @@ -346,6 +385,7 @@ def impl( out_amax, out_scale, out_dtype, + batched_output, contracting_dims, fuse_gelu, fuse_bias, @@ -356,25 +396,59 @@ def impl( assert CollectiveGemmPrimitive.inner_primitive is not None lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) - lhs_trans = lhs_inner_dim != lhs.ndim - 1 - rhs_trans = rhs_inner_dim == rhs.ndim - 1 - - # Collapse batch dimensions that are larger thanm size 1. 
- # FWD: (B, M, K) x (K, N) = (B*M, K) x (K, N) = (B*M, N) - # DGRAD: (B, M, N) x (K, N)^T = (B*M, N) x (N, K) = (B*M, K) - # WGRAD: (B, M, K)^T x (B, M, N) = (K, B*M) x (B*M, N) = (K, N) - batch_shape = [lhs.shape[dim] for dim in range(lhs.ndim - 2)] - batch_size = reduce(operator.mul, batch_shape, 1) - reshape_output = not (lhs.ndim > 2 and rhs.ndim > 2) - if lhs.ndim > 2: - lhs_2d_shape = (batch_size * lhs.shape[-2], lhs.shape[-1]) - lhs = jax.lax.reshape(lhs, lhs_2d_shape) - contracting_dims = (0 if lhs_trans else 1, contracting_dims[1]) - if rhs.ndim > 2: - rhs_2d_shape = (reduce(operator.mul, rhs.shape[:-1], 1), rhs.shape[-1]) - rhs = jax.lax.reshape(rhs, rhs_2d_shape) - contracting_dims = (contracting_dims[0], 1 if rhs_trans else 0) + lhs_outer_dim, rhs_outer_dim = map( + mirror_dim, (lhs_inner_dim, rhs_inner_dim), (lhs.ndim, rhs.ndim) + ) + + # Infer output shape and collapse batch dimensions + lhs_2d_shape = rhs_2d_shape = None + lhs_layout = rhs_layout = None + lhs_batch_dims = [ + dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim] + ] + lhs_batch_shape = [lhs.shape[dim] for dim in lhs_batch_dims] + lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) + contracting_dims_2d = list(contracting_dims).copy() + if batched_output: + # If output is batched, the LSH batch dimension collapses into the outer dimension + # and RHS cannot be batched + lhs_2d_shape = (lhs_batch_size * lhs.shape[lhs_outer_dim], lhs.shape[lhs_inner_dim]) + lhs_layout = (*lhs_batch_dims, lhs_outer_dim, lhs_inner_dim) + contracting_dims_2d[0] = 1 + else: + # If the output is not batched, both LHS and RHS batch dimensions collapse into the + # contracting dimensions + lhs_2d_shape = (lhs_batch_size * lhs.shape[lhs_inner_dim], lhs.shape[lhs_outer_dim]) + lhs_layout = (*lhs_batch_dims, lhs_inner_dim, lhs_outer_dim) + contracting_dims_2d[0] = 0 + + rhs_batch_dims = [ + dim for dim in range(rhs.ndim) if dim not in [rhs_inner_dim, rhs_outer_dim] + ] + rhs_batch_shape = [rhs.shape[dim] for dim in rhs_batch_dims] + rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) + rhs_2d_shape = (rhs_batch_size * rhs.shape[rhs_inner_dim], rhs.shape[rhs_outer_dim]) + rhs_layout = (*rhs_batch_dims, rhs_inner_dim, rhs_outer_dim) + contracting_dims_2d[1] = 0 + + # Reshape LHS and RHS into 2D and fix layouts for FP8 GEMM + if lhs_2d_shape is not None and lhs.ndim > 2: + lhs = jax.lax.reshape(lhs, lhs_2d_shape, dimensions=lhs_layout) + if jax_dtype_is_fp8(lhs.dtype): + lhs = jax.lax.transpose(lhs, (1, 0)) + contracting_dims_2d[0] = 1 + else: + contracting_dims_2d[0] = contracting_dims[0] + + if rhs_2d_shape is not None and rhs.ndim > 2: + rhs = jax.lax.reshape(rhs, rhs_2d_shape, dimensions=rhs_layout) + if jax_dtype_is_fp8(rhs.dtype): + rhs = jax.lax.transpose(rhs, (1, 0)) + contracting_dims_2d[1] = 1 + else: + contracting_dims_2d[1] = contracting_dims[1] + # Invoke GEMM with guaranteed 2D inputs, so batched_output=False ( out, out_amax_updated, @@ -392,7 +466,8 @@ def impl( out_amax, out_scale, out_dtype=out_dtype, - contracting_dims=contracting_dims, + batched_output=False, + contracting_dims=contracting_dims_2d, fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, grad=grad, @@ -401,9 +476,9 @@ def impl( ) # Recover batched dimensions in the output - if reshape_output: - out_batched_shape = (*batch_shape, int(out.shape[-2] / batch_size), out.shape[-1]) - out = jax.lax.reshape(out, out_batched_shape) + if batched_output: + out_shape = (*lhs_batch_shape, out.shape[-2] // lhs_batch_size, out.shape[-1]) + out = 
jax.lax.reshape(out, out_shape) return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad @@ -413,6 +488,7 @@ def batcher( batch_dims, *, out_dtype, + batched_output, contracting_dims, fuse_gelu, fuse_bias, @@ -428,6 +504,7 @@ def batcher( CollectiveGemmPrimitive.outer_primitive.bind( *batched_args, out_dtype=out_dtype, + batched_output=batched_output, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, @@ -441,6 +518,7 @@ def batcher( @staticmethod def infer_sharding_from_operands( out_dtype, + batched_output, contracting_dims, fuse_gelu, fuse_bias, @@ -456,34 +534,43 @@ def infer_sharding_from_operands( lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) - if lhs_spec[lhs_inner_dim] != rhs_spec[rhs_inner_dim] and not grad: + lhs_outer_dim, rhs_outer_dim = map( + mirror_dim, + (lhs_inner_dim, rhs_inner_dim), + (lhs.ndim, rhs.ndim), + ) + + # Modify operand specs: + # - FSDP axes are all-gathered + # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded + # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension + lhs_spec_new = remove_fsdp_specs(lhs_spec) + rhs_spec_new = remove_fsdp_specs(rhs_spec) + if lhs_spec_new[lhs_inner_dim] != rhs_spec_new[rhs_inner_dim] and not grad: warnings.warn( "Forcing the inner dimension of LHS to match the sharding of inner " + "dimension of RHS. This can trigger additional communication if LHS is " + "not already partitioned correctly." ) - - lhs_outer_dim, rhs_outer_dim = map( - lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, - (lhs_inner_dim, rhs_inner_dim), - (lhs.ndim, rhs.ndim), - ) - rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 - lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] - batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] - rhs_outer_spec = rhs_spec[rhs_outer_dim] - - # Outer (sequence) dimension of the GEMM output is always unsharded - out_spec = [*batch_specs, None, rhs_outer_spec] - if lhs.ndim > 2 and rhs.ndim > 2: - out_spec = [None, rhs_outer_spec] + rhs_outer_spec = rhs_spec_new[rhs_outer_dim] + if rhs_outer_spec is not None: + lhs_spec_new[lhs_outer_dim] = None + lhs_spec_new[lhs_inner_dim] = rhs_spec_new[rhs_inner_dim] + + # Output sharding is conditional on output shape + lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim]] + batch_spec = [lhs_spec_new[dim] for dim in lhs_bdims] + lhs_outer_spec = lhs_spec_new[lhs_outer_dim] + out_spec = [lhs_outer_spec, rhs_outer_spec] + if batched_output: + out_spec = batch_spec + out_spec out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) # FP8 metas are always unsharded fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) - # Pre-GELU output matches output, if GELU fusion is turned on, otherwise unsharded - gelu_spec = [None, rhs_outer_spec] if fuse_gelu else [None] + # Pre-GELU output is always 2D if GELU fusion is turned on, otherwise unsharded + gelu_spec = [lhs_outer_spec, rhs_outer_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) # Bias gradient spec matches outer dimension of output if bias fusion is turned on @@ -493,6 +580,7 @@ def infer_sharding_from_operands( @staticmethod def partition( out_dtype, + batched_output, contracting_dims, fuse_gelu, fuse_bias, @@ -509,41 +597,22 @@ def 
partition( lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) lhs_outer_dim, rhs_outer_dim = map( - lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + mirror_dim, (lhs_inner_dim, rhs_inner_dim), (lhs.ndim, rhs.ndim), ) - lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim]] - batch_specs = [lhs_spec[bdim] for bdim in lhs_bdims] - - # Force all-gather the outer (sequence) dimension of the LHS operand - lhs_spec_new = [spec for spec in lhs_spec] - lhs_spec_new[lhs_outer_dim] = None - lhs_spec_new[lhs_inner_dim] = rhs_spec[rhs_inner_dim] - lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) - - # If both dims of RHS is sharded (i.e. FSDP), determine if we do AG or AR based on LHS - # sharding. - rhs_spec_new = [spec for spec in rhs_spec] - if rhs_spec[rhs_inner_dim] is not None and rhs_spec[rhs_outer_dim] is not None: - if lhs_spec[lhs_inner_dim] is not None and lhs_spec[lhs_outer_dim] is not None: - # All dimensions of both LHS and RHS are sharded and the collective operation is - # ambiguous, we cannot infer sharding. - raise RuntimeError( - "Collective GEMM custom op cannot infer partitioning when both outer and " - + "contracting dimensions of both LHS and RHS operands are sharded." - ) - elif lhs_spec[lhs_inner_dim] is not None: - # All-reduce after GEMM, so unshard the outer dimension of RHS - rhs_spec_new[rhs_outer_dim] = None - else: - # We either do all-gather before GEMM, or LHS is already unsharded, so unshard - # the inner dimension of RHS to match - rhs_spec_new[rhs_inner_dim] = None + # Modify operand specs: + # - FSDP axes are all-gathered + # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded + # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension + lhs_spec_new = remove_fsdp_specs(lhs_spec) + rhs_spec_new = remove_fsdp_specs(rhs_spec) rhs_outer_spec = rhs_spec_new[rhs_outer_dim] - - # RHS operand is unchanged, we already enforce that only one dimension can be sharded + if rhs_outer_spec is not None: + lhs_spec_new[lhs_outer_dim] = None + lhs_spec_new[lhs_inner_dim] = rhs_spec_new[rhs_inner_dim] + lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) rhs_sharding = NamedSharding(mesh, PartitionSpec(*rhs_spec_new)) # Bias is sharded to match outer dimension spec of the RHS operand (also the output) @@ -552,14 +621,17 @@ def partition( # FP8 metas are always unsharded fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) - # Outer (sequence) dimension of the GEMM output is always unsharded - out_spec = [*batch_specs, None, rhs_outer_spec] - if lhs.ndim > 2 and rhs.ndim > 2: - out_spec = [None, rhs_outer_spec] + # Output sharding is conditional on output shape + lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim]] + batch_spec = [lhs_spec_new[dim] for dim in lhs_bdims] + lhs_outer_spec = lhs_spec_new[lhs_outer_dim] + out_spec = [lhs_outer_spec, rhs_outer_spec] + if batched_output: + out_spec = batch_spec + out_spec out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) - # Pre-GELU output matches output spec if GELU fusion is turned on, otherwise unsharded - gelu_spec = [None, rhs_outer_spec] if fuse_gelu else [None] + # Pre-GELU output is always 2D if GELU fusion is turned on, otherwise unsharded + gelu_spec = [lhs_outer_spec, rhs_outer_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) 
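# ---------------------------------------------------------------------------
# A worked example of the spec composition above (a sketch only, with
# hypothetical axis names, not taken from this patch): assuming a ("dp", "tp")
# mesh, an LHS ([B], M, K) sharded as ("dp", None, None) and an RHS (K, N)
# sharded as (None, "tp"), the batched output picks up ("dp", None, "tp").
import numpy as np
import jax
from jax.sharding import Mesh, NamedSharding, PartitionSpec

def _example_out_sharding():
    devices = np.array(jax.devices()).reshape(-1, 1)  # dp x tp, tp=1 here
    mesh = Mesh(devices, ("dp", "tp"))
    batch_spec, lhs_outer_spec, rhs_outer_spec = ["dp"], None, "tp"
    out_spec = [*batch_spec, lhs_outer_spec, rhs_outer_spec]
    return NamedSharding(mesh, PartitionSpec(*out_spec))
# ---------------------------------------------------------------------------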
arg_shardings = ( @@ -599,6 +671,7 @@ def sharded_impl( out_amax, out_scale, out_dtype=out_dtype, + batched_output=batched_output, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, @@ -637,6 +710,7 @@ def fp8_gemm_impl( out_amax: Optional[ArrayLike] = None, out_scale: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, + batched_output: bool = False, fuse_gelu: bool = False, fuse_bias: bool = False, accumulate: bool = False, @@ -657,8 +731,8 @@ def fp8_gemm_impl( if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=bias.dtype) elif gelu_input is None: - out_shape = (*lhs.shape[:-2], lhs.shape[-2], rhs_t.shape[-2]) - gelu_input = jnp.zeros(out_shape, dtype=bias.dtype) + gelu_shape = (reduce(operator.mul, lhs.shape[:-1]), rhs_t.shape[-1]) + gelu_input = jnp.zeros(gelu_shape, dtype=bias.dtype) out, out_amax, out_scale, pre_gelu_out, _ = CollectiveGemmPrimitive.outer_primitive.bind( lhs, @@ -670,6 +744,7 @@ def fp8_gemm_impl( out_amax, out_scale, out_dtype=out_dtype, + batched_output=batched_output, contracting_dims=(-1, -1), fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, @@ -686,7 +761,8 @@ def gemm_impl( rhs: ArrayLike, bias: Optional[ArrayLike] = None, gelu_input: Optional[ArrayLike] = None, - contracting_dims: Tuple[int, int] = (1, 0), + batched_output: bool = False, + contracting_dims: Tuple[int, int] = (-1, -2), fuse_gelu: bool = False, fuse_bias: bool = False, grad: bool = False, @@ -696,16 +772,15 @@ def gemm_impl( """Non-FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) lhs_outer_dim, rhs_outer_dim = map( - lambda inner_dim, ndim: ndim - 2 if inner_dim == ndim - 1 else ndim - 1, + mirror_dim, (lhs_inner_dim, rhs_inner_dim), (lhs.ndim, rhs.ndim), ) - out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) if not fuse_bias: bias = jnp.zeros(0, dtype=lhs.dtype) elif grad: - bias = jnp.zeros(out_shape[-1], dtype=lhs.dtype) + bias = jnp.zeros(rhs.shape[rhs_outer_dim], dtype=lhs.dtype) else: assert bias is not None, "Missing bias in forward GEMM when bias epilogue is enabled." @@ -716,7 +791,10 @@ def gemm_impl( gelu_input is not None ), "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." 
elif gelu_input is None: - gelu_input = jnp.zeros(out_shape, dtype=lhs.dtypes) + bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim]] + batch_size = reduce(operator.mul, [lhs.shape[dim] for dim in bdims], 1) + gelu_shape = (batch_size * lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + gelu_input = jnp.zeros(gelu_shape, dtype=lhs.dtypes) dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) out, _, _, pre_gelu_out, bias_grad = CollectiveGemmPrimitive.outer_primitive.bind( @@ -729,6 +807,7 @@ def gemm_impl( dummy_fp8_meta, dummy_fp8_meta, out_dtype=lhs.dtype, + batched_output=batched_output, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 730d17846e..18d1f76da7 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -18,7 +18,7 @@ dbias_cast_transpose, dact_lu_dbias_cast_transpose, ) -from .cpp_extensions.gemm import sanitize_dims +from .cpp_extensions.gemm import sanitize_dims, mirror_dim __all__ = [ @@ -72,10 +72,13 @@ def _gemm_fwd_rule( fuse_bias = bias is not None + # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) --------> ([B], M, N/P) + # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) out, pre_gelu_out = gemm_impl( x, kernel, bias=bias, + batched_output=(x.ndim > 2), contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, @@ -103,14 +106,22 @@ def _gemm_bwd_rule( ): x, kernel, pre_gelu_out, fuse_bias = ctx x_inner_dim, kernel_inner_dim = map(sanitize_dims, contracting_dims, (x.ndim, kernel.ndim)) - x_outer_dim = x.ndim - 1 if x_inner_dim != x.ndim - 1 else x.ndim - 2 - kernel_outer_dim = kernel.ndim - 2 if kernel_inner_dim == kernel.ndim - 1 else kernel.ndim - 1 + x_outer_dim, kernel_outer_dim = map( + mirror_dim, (x_inner_dim, kernel_inner_dim), (x.ndim, kernel.ndim) + ) + + # FWD MODE: + # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) --------> ([B], M, N/P) + # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) - # DGRAD: ([B], M, N) x (K, N)^T = ([B], M, K) + # DGRAD: + # AG+GEMM: ([B], M, N/P) x (K, N/P)^T --(AR)--> ([B], M, K) + # GEMM+AR: ([B], M, N) x (K/P, N)^T --------> ([B], M, K/P) dgrad, dgelu, _ = gemm_impl( grad, kernel, gelu_input=pre_gelu_out, + batched_output=(x.ndim > 2), contracting_dims=(-1, kernel_outer_dim), fuse_gelu=fuse_gelu, fuse_bias=False, @@ -119,12 +130,15 @@ def _gemm_bwd_rule( use_split_accumulator=use_split_accumulator, ) - # WGRAD: ([B], M, K)^T x ([B], M, N) = (K, N) + # WGRAD: + # AG+GEMM: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) + # GEMM+AR: ([B], M, K/P)^T x ([B], M, N) ----> (K/P, N) wgrad_rhs = dgelu if fuse_gelu else grad wgrad, _, bgrad = gemm_impl( x, wgrad_rhs, gelu_input=pre_gelu_out, + batched_output=False, contracting_dims=(x_outer_dim, wgrad_rhs.ndim - 2), fuse_gelu=False, fuse_bias=fuse_bias, @@ -279,6 +293,7 @@ def _fp8_gemm_fwd_rule( out_amax=out_amax, out_scale=out_scale, out_dtype=out_dtype, + batched_output=(x.ndim > 2), fuse_gelu=fuse_gelu, fuse_bias=fuse_bias, accumulate=accumulate, @@ -300,6 +315,7 @@ def _fp8_gemm_fwd_rule( pre_gelu_out if fuse_gelu else None, fuse_bias, maybe_fp32_to_fm32, + (x.ndim > 2), ) return (out, updated_out_scale), ctx @@ -325,6 +341,7 @@ def _fp8_gemm_bwd_rule( pre_gelu_out, fuse_bias, maybe_fp32_to_fm32, + batched_input, ) = ctx bwd_dtype = FP8Helper.BWD_DTYPE @@ -382,6 +399,7 @@ def _fp8_gemm_bwd_rule( grad_scale_inv, casted_kernel, 
kernel_scale_inv, + batched_output=batched_input, accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) @@ -392,6 +410,7 @@ def _fp8_gemm_bwd_rule( x_scale_inv, casted_grad_t, grad_scale_inv, + out_shape=False, accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) From ce86dcb9c5d55c409ac92f9d8bafb0b7f01bc042 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 21 Nov 2024 11:38:22 +0000 Subject: [PATCH 11/32] fixed logic to remove FSDP sharding Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index bf80941f85..d54009e60b 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -49,25 +49,44 @@ def mirror_dim(dim, ndims): def remove_fsdp_specs(pspecs): fsdp_resource = global_mesh_resource().fsdp_resource + if fsdp_resource is None: + return list(pspecs).copy() + new_pspecs = [] for spec in pspecs: if spec is None: new_pspecs.append(None) - elif fsdp_resource not in spec: - new_pspecs.append(spec) + elif isinstance(spec, Iterable) and not isinstance(spec, str): new_spec = [] for s in spec: - if s != fsdp_resource: + if s == fsdp_resource: + new_spec.append(None) + else: new_spec.append(s) + if len(new_spec) > 1: new_pspecs.append(new_spec) elif len(new_spec) == 1: new_pspecs.append(new_spec[0]) else: new_pspecs.append(None) + + elif isinstance(spec, str): + if spec == fsdp_resource: + new_pspecs.append(None) + else: + new_pspecs.append(spec) + else: - new_pspecs.append(None) + new_pspecs.append(spec) + + assert len(new_pspecs) == len(pspecs), ( + "Length of partition specs changed when removing FSDP sharding!\n" + + f"Original: {pspecs}\n" + + f"Filtered: {new_pspecs}\n" + ) + return new_pspecs From b215f207bd78acfd672264f7e52880a0a8137598 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:38:49 +0000 Subject: [PATCH 12/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/cpp_extensions/gemm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index d54009e60b..3c4bf15d00 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -198,18 +198,18 @@ def abstract( # Infer output shape if batched_output: - assert lhs_aval.ndim > 2 and rhs_aval.ndim == 2, ( - "Batched output requires batched LHS and non-batched RHS operands." - ) + assert ( + lhs_aval.ndim > 2 and rhs_aval.ndim == 2 + ), "Batched output requires batched LHS and non-batched RHS operands." out_shape = ( *lhs_batch_shape, lhs_aval.shape[lhs_outer_dim], - rhs_aval.shape[rhs_outer_dim] + rhs_aval.shape[rhs_outer_dim], ) else: - assert lhs_aval.ndim == rhs_aval.ndim, ( - "Non-batched output requires LHS and RHS operands with same number of dimensions." - ) + assert ( + lhs_aval.ndim == rhs_aval.ndim + ), "Non-batched output requires LHS and RHS operands with same number of dimensions." 
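# ---------------------------------------------------------------------------
# The batched vs. non-batched output rule, summarized as a simplified sketch
# (assumes the forward and wgrad layouts used in this patch; not part of it):
#   batched_output=True : ([B], M, K) x (K, N)          -> ([B], M, N)
#   batched_output=False: ([B], M, K)^T x ([B], M, N)   -> (K, N)  (batch folds
#                                                          into the contraction)
def _expected_out_shape(lhs_shape, rhs_shape, batched_output):
    if batched_output:
        return (*lhs_shape[:-2], lhs_shape[-2], rhs_shape[-1])
    return (lhs_shape[-1], rhs_shape[-1])

assert _expected_out_shape((8, 512, 256), (256, 1024), True) == (8, 512, 1024)
assert _expected_out_shape((8, 512, 256), (8, 512, 1024), False) == (256, 1024)
# ---------------------------------------------------------------------------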
if lhs_aval.ndim > 2: rhs_bdims = [ dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] From cbab16c03109cf5b802b93adc03841828df332dd Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 21 Nov 2024 19:03:52 +0000 Subject: [PATCH 13/32] retained FSDP dims and pushed FSDP all-gather of weight array to outside the custom op Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 52 ++----------------- transformer_engine/jax/gemm.py | 1 + 2 files changed, 6 insertions(+), 47 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 3c4bf15d00..353f2d2509 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -47,49 +47,6 @@ def mirror_dim(dim, ndims): return ndims - 2 if dim == ndims - 1 else ndims - 1 -def remove_fsdp_specs(pspecs): - fsdp_resource = global_mesh_resource().fsdp_resource - if fsdp_resource is None: - return list(pspecs).copy() - - new_pspecs = [] - for spec in pspecs: - if spec is None: - new_pspecs.append(None) - - elif isinstance(spec, Iterable) and not isinstance(spec, str): - new_spec = [] - for s in spec: - if s == fsdp_resource: - new_spec.append(None) - else: - new_spec.append(s) - - if len(new_spec) > 1: - new_pspecs.append(new_spec) - elif len(new_spec) == 1: - new_pspecs.append(new_spec[0]) - else: - new_pspecs.append(None) - - elif isinstance(spec, str): - if spec == fsdp_resource: - new_pspecs.append(None) - else: - new_pspecs.append(spec) - - else: - new_pspecs.append(spec) - - assert len(new_pspecs) == len(pspecs), ( - "Length of partition specs changed when removing FSDP sharding!\n" - + f"Original: {pspecs}\n" - + f"Filtered: {new_pspecs}\n" - ) - - return new_pspecs - - def get_cublas_workspace_size_bytes() -> None: """Return 32 MiB if using hopper, 4 MiB for all other architectures.""" if tex.get_device_compute_capability() >= 90: @@ -563,8 +520,8 @@ def infer_sharding_from_operands( # - FSDP axes are all-gathered # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension - lhs_spec_new = remove_fsdp_specs(lhs_spec) - rhs_spec_new = remove_fsdp_specs(rhs_spec) + lhs_spec_new = [spec for spec in lhs_spec] + rhs_spec_new = [spec for spec in rhs_spec] if lhs_spec_new[lhs_inner_dim] != rhs_spec_new[rhs_inner_dim] and not grad: warnings.warn( "Forcing the inner dimension of LHS to match the sharding of inner " @@ -594,6 +551,7 @@ def infer_sharding_from_operands( # Bias gradient spec matches outer dimension of output if bias fusion is turned on bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) + return (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, bias_sharding) @staticmethod @@ -625,8 +583,8 @@ def partition( # - FSDP axes are all-gathered # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension - lhs_spec_new = remove_fsdp_specs(lhs_spec) - rhs_spec_new = remove_fsdp_specs(rhs_spec) + lhs_spec_new = [spec for spec in lhs_spec] + rhs_spec_new = [spec for spec in rhs_spec] rhs_outer_spec = rhs_spec_new[rhs_outer_dim] if rhs_outer_spec is not None: lhs_spec_new[lhs_outer_dim] = None diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 18d1f76da7..464ccb12f9 
100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -8,6 +8,7 @@ import jax import jax.numpy as jnp from jax.typing import ArrayLike +from jax.sharding import NamedSharding, PartitionSpec from .fp8 import FP8Helper, FP8MetaPackage from .cpp_extensions import ( From 0ea55c0eed1c5551a8b8872ff095d70d9e5d1625 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 21 Nov 2024 19:46:02 +0000 Subject: [PATCH 14/32] Added useful warning about DGRAD sharding not matching sequence/context-parallel LHS operands Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 353f2d2509..823e9f7ea1 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -53,7 +53,6 @@ def get_cublas_workspace_size_bytes() -> None: return 33_554_432 return 4_194_304 - class CollectiveGemmPrimitive(BasePrimitive): """ cuBlasLt GEMM Primitive w/ support for distributed inputs @@ -385,15 +384,9 @@ def impl( lhs_batch_shape = [lhs.shape[dim] for dim in lhs_batch_dims] lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) contracting_dims_2d = list(contracting_dims).copy() - if batched_output: - # If output is batched, the LSH batch dimension collapses into the outer dimension - # and RHS cannot be batched - lhs_2d_shape = (lhs_batch_size * lhs.shape[lhs_outer_dim], lhs.shape[lhs_inner_dim]) - lhs_layout = (*lhs_batch_dims, lhs_outer_dim, lhs_inner_dim) - contracting_dims_2d[0] = 1 - else: - # If the output is not batched, both LHS and RHS batch dimensions collapse into the - # contracting dimensions + if lhs.ndim > 2 and rhs.ndim > 2: + # If both LHS and RHS are batched, the batch dimensions collapse into the + # contracting dimensions for both operands lhs_2d_shape = (lhs_batch_size * lhs.shape[lhs_inner_dim], lhs.shape[lhs_outer_dim]) lhs_layout = (*lhs_batch_dims, lhs_inner_dim, lhs_outer_dim) contracting_dims_2d[0] = 0 @@ -406,6 +399,11 @@ def impl( rhs_2d_shape = (rhs_batch_size * rhs.shape[rhs_inner_dim], rhs.shape[rhs_outer_dim]) rhs_layout = (*rhs_batch_dims, rhs_inner_dim, rhs_outer_dim) contracting_dims_2d[1] = 0 + elif lhs.ndim > 2: + # If only the LHS is batched,the batch dimension collapses into the outer dimension + lhs_2d_shape = (lhs_batch_size * lhs.shape[lhs_outer_dim], lhs.shape[lhs_inner_dim]) + lhs_layout = (*lhs_batch_dims, lhs_outer_dim, lhs_inner_dim) + contracting_dims_2d[0] = 1 # Reshape LHS and RHS into 2D and fix layouts for FP8 GEMM if lhs_2d_shape is not None and lhs.ndim > 2: @@ -524,12 +522,17 @@ def infer_sharding_from_operands( rhs_spec_new = [spec for spec in rhs_spec] if lhs_spec_new[lhs_inner_dim] != rhs_spec_new[rhs_inner_dim] and not grad: warnings.warn( - "Forcing the inner dimension of LHS to match the sharding of inner " - + "dimension of RHS. This can trigger additional communication if LHS is " - + "not already partitioned correctly." + "Forcing LHS sharding in the contracting dimension to match RHS. This can trigger " + + "additional communication if LHS is not already partitioned correctly." ) rhs_outer_spec = rhs_spec_new[rhs_outer_dim] if rhs_outer_spec is not None: + warnings.warn( + "Forcing the outer dimension of LHS (sequence/context dim) to be all- gathered. " + + "This may trigger additional communication if LHS is not already partitioned " + + "correctly. 
Additionally, the DGRAD output in the backward pass will not match " + + "the sharding of a sequence/context-parallel LHS operand." + ) lhs_spec_new[lhs_outer_dim] = None lhs_spec_new[lhs_inner_dim] = rhs_spec_new[rhs_inner_dim] @@ -661,8 +664,8 @@ def sharded_impl( if jax_dtype_is_fp8(lhs.dtype): out_amax_updated = all_reduce_max_along_all_axes_except_PP(out_amax_updated, mesh) - # GEMM output needs to be all-reduced when the contracting dimension is sharded. if rhs_spec_new[rhs_inner_dim] is not None: + # GEMM output needs to be all-reduced when the contracting dimension is sharded. out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().tp_resource, mesh) if fuse_gelu: pre_gelu_out = lax_paral_op( From 2acb92f49b4687fde25f803f3115b693b900569b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:46:35 +0000 Subject: [PATCH 15/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/cpp_extensions/gemm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 823e9f7ea1..31a8760564 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -53,6 +53,7 @@ def get_cublas_workspace_size_bytes() -> None: return 33_554_432 return 4_194_304 + class CollectiveGemmPrimitive(BasePrimitive): """ cuBlasLt GEMM Primitive w/ support for distributed inputs From b07bb2db5726d45dba28b7207bbc2051f166d8c4 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 21 Nov 2024 19:47:44 +0000 Subject: [PATCH 16/32] documentation fixes Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 31a8760564..0f567eecef 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -516,7 +516,6 @@ def infer_sharding_from_operands( ) # Modify operand specs: - # - FSDP axes are all-gathered # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension lhs_spec_new = [spec for spec in lhs_spec] @@ -584,7 +583,6 @@ def partition( ) # Modify operand specs: - # - FSDP axes are all-gathered # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension lhs_spec_new = [spec for spec in lhs_spec] From 765b844525e42d2def624bce7430f798828874d9 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Wed, 27 Nov 2024 21:54:39 +0000 Subject: [PATCH 17/32] added unit test, both AG+GEMM and GEMM+AR passing with FSDP+TP, DP+TP and TP-only meshes Signed-off-by: Alp Dener --- tests/jax/test_distributed_gemm.py | 311 ++++++++++++++++++ transformer_engine/jax/cpp_extensions/gemm.py | 107 +++--- transformer_engine/jax/gemm.py | 31 +- 3 files changed, 400 insertions(+), 49 deletions(-) create mode 100644 tests/jax/test_distributed_gemm.py diff --git a/tests/jax/test_distributed_gemm.py b/tests/jax/test_distributed_gemm.py new file mode 100644 index 0000000000..f1e3c58c4a --- /dev/null +++ b/tests/jax/test_distributed_gemm.py @@ -0,0 +1,311 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +import pytest +from functools import partial +from collections.abc import Iterable + +import numpy as np + +import jax +import jax.numpy as jnp +from jax.sharding import Mesh, NamedSharding, PartitionSpec +from jax.experimental import mesh_utils + +import transformer_engine.jax as te +from transformer_engine.jax.gemm import gemm + +from utils import assert_allclose + + +jax.config.update('jax_enable_compilation_cache', False) + + +# AG+GEMM: (4, 32/P, 128) ----(AG)----> (4, 32, 128) x (128, 256/P) ----------> (4, 32, 256/P) +# - DGRAD: (4, 32, 256/P) x (128, 256/P)^T --(AR)--> (4, 32, 128) +# - WGRAD: (4, 32/P, 128)^T --(AG)--> (4, 32, 128)^T x (4, 32, 256/P) --------> (128, 256/P) + +# GEMM+AR: (4, 32, 256/P) x (256/P, 128) --(AR)--> (4, 32, 128) +# - DGRAD: (4, 32, 128) x (256/P, 128)^T ------> (4, 32, 256/P) +# - WGRAD: (4, 32, 256/P)^T --(AG)--> (4, 32, 256)^T x (4, 32, 128) --------> (256, 128) + +BATCH = 4 +BASE_SIZE = 16 +SEQ_LEN = BASE_SIZE * 8 +HIDDEN_SIZE = BASE_SIZE * 6 +FFN_HIDDEN_SIZE = BASE_SIZE * 16 + +COMM_TYPES = ["ALL_GATHER", "ALL_REDUCE"] +MESH_TYPES = ["FSDP_TP", "DP_TP", "TP"] +NUM_DEVICES = 4 + +is_fp8_supported, no_fp8_reason = te.fp8.is_fp8_available() + + +def _get_mesh(parallel_dist): + jax.clear_caches() + + batched = False + fsdp = False + mesh_shape = dict(tp=NUM_DEVICES) + resources = dict(cp_resource='tp', tp_resource='tp') + if parallel_dist in ["DP_TP", "FSDP_TP"]: + batched = True + mesh_shape.update(dict(tp=NUM_DEVICES//2, dp=NUM_DEVICES//2)) + resources.update(dict(dp_resource='dp')) + if parallel_dist == "FSDP_TP": + fsdp = True + mesh_shape.update(dict(tp=NUM_DEVICES//2, dp=1, zp=NUM_DEVICES//2)) + resources.update(dict(fsdp_resource='zp')) + mesh_resource = te.MeshResource(**resources) + + devices = mesh_utils.create_device_mesh( + (NUM_DEVICES, ), devices=jax.devices()[:NUM_DEVICES] + ) + + mesh = Mesh(np.array(devices).reshape(tuple(mesh_shape.values())), tuple(mesh_shape.keys())) + + return mesh, mesh_resource, batched, fsdp + + +def _get_inputs(mesh, mesh_resource, dtype, fwd_comm_type, batched, fsdp, fwd_bwd=False): + fp8_gemm = dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] + + # Operand and output shapes + lhs_shape = ( + [SEQ_LEN, HIDDEN_SIZE] + if fwd_comm_type == "ALL_GATHER" + else [SEQ_LEN, FFN_HIDDEN_SIZE] + ) + rhs_shape = ( + [HIDDEN_SIZE, FFN_HIDDEN_SIZE] + if fwd_comm_type == "ALL_GATHER" + else [FFN_HIDDEN_SIZE, HIDDEN_SIZE] + ) + out_shape = [lhs_shape[0], rhs_shape[1]] + + if batched: + lhs_shape = [BATCH] + lhs_shape + out_shape = [BATCH] + out_shape + + # Operand and output partition specs + lhs_spec = ( + [mesh_resource.tp_resource, None] + if fwd_comm_type == "ALL_GATHER" + else [None, mesh_resource.tp_resource] + ) + rhs_spec = ( + [None, mesh_resource.tp_resource] + if fwd_comm_type == "ALL_GATHER" + else [mesh_resource.tp_resource, None] + ) + out_spec = [None, rhs_spec[-1]] + + # Modify RHS operand for FP8 + fsdp_gathered_rhs_spec = rhs_spec.copy() + if fp8_gemm: + rhs_shape = list(reversed(rhs_shape)) + rhs_spec = list(reversed(rhs_spec)) + fsdp_gathered_rhs_spec = list(reversed(fsdp_gathered_rhs_spec)) + + # Add batch dimensions and specs + if batched: + if fsdp: + lhs_spec = [(mesh_resource.dp_resource, mesh_resource.fsdp_resource)] + lhs_spec + rhs_spec = [mesh_resource.fsdp_resource if spec is None else spec for spec in rhs_spec] + out_spec = [(mesh_resource.dp_resource, mesh_resource.fsdp_resource)] + out_spec + else: + lhs_spec = 
[mesh_resource.dp_resource] + lhs_spec + out_spec = [mesh_resource.dp_resource] + out_spec + + # Allocate global operands on device + key = jax.random.PRNGKey(42) + split_keys = jax.random.split(key, 3 if fwd_bwd else 2) + mu = 0.0 + sigma = 0.023 + shapes = (lhs_shape, rhs_shape) + if fwd_bwd: + shapes += (out_shape, ) + global_operands = list( + map( + lambda key, shape: jax.device_put( + mu + (sigma * jax.random.normal(key, shape, dtype=dtype)), + NamedSharding(mesh, PartitionSpec(None)) + ), + split_keys, + shapes, + ) + ) + + # Allocate sharded operands on device + partition_axes = (lhs_spec, rhs_spec) + if fwd_bwd: + partition_axes += (out_spec, ) + local_operands = list( + map( + lambda x, spec: jax.device_put(x, NamedSharding(mesh, PartitionSpec(*spec))), + global_operands, + partition_axes, + ) + ) + + # Tranpose global RHS back to non-transpoosed orientation if it was originally allocated + # for FP8 GEMM + if fp8_gemm: + rhs_global = jnp.matrix_transpose(global_operands[1]) + global_operands = (global_operands[0], rhs_global, *global_operands[2:]) + + return ( + local_operands, + global_operands, + (out_shape, out_spec), + fsdp_gathered_rhs_spec, + ) + + +def _check_output(mesh, expected_out_shape, expected_out_specs, *tensors, fwd_bwd=False): + num_operands = 3 if fwd_bwd else 2 + ref_operands = tensors[:num_operands] + test_outputs = tensors[num_operands:] + + # Check number of dimensions + assert test_outputs[0].ndim == len(expected_out_shape), ( + f"Output has different number of dimensions ({test_outputs[0].ndim}) than expected " + + f"({len(expected_out_shape)})" + ) + + # Pad test output spec for unsharded dimensions + test_spec = te.sharding.get_padded_spec(test_outputs[0].sharding.spec, test_outputs[0].ndim) + + for i in range(test_outputs[0].ndim): + # Check shape + assert test_outputs[0].shape[i] == expected_out_shape[i], ( + f"Output with shape {test_outputs[0].shape} does not match expected shape " + + f"{expected_out_shape} in dimension index {i}." + ) + + # Check shardings (with padded output spec) + spec_mismatch = False + if isinstance(expected_out_specs[i], str): + if test_spec[i] != expected_out_specs[i]: + spec_mismatch = True + elif isinstance(expected_out_specs[i], Iterable): + if not isinstance(test_spec[i], type(expected_out_specs[i])): + if test_spec[i] not in expected_out_specs[i]: + spec_mismatch = True + elif len(test_spec[i]) != len(expected_out_specs[i]): + spec_mismatch = True + else: + for j in range(len(expected_out_specs[i])): + if test_spec[i][j] != expected_out_specs[i][j]: + spec_mismatch = True + break + elif expected_out_specs[i] == None: + if test_spec[i] != None: + spec_mismatch = True + else: + raise RuntimeError("Internal TE error: Unrecognized reference partition spec type.") + if spec_mismatch: + raise AssertionError( + f"Output sharding {test_spec} does not match expected sharding " + + f"{expected_out_specs} in dimension index {i}." 
+ ) + + def _native_gemm_fwd_bwd(lhs, rhs, grad): + fwd_out, vjp_fn = jax.vjp(jnp.dot, lhs, rhs) + lhs_grad, rhs_grad = vjp_fn(grad) + return fwd_out, lhs_grad, rhs_grad + + ref_fn = jax.jit(_native_gemm_fwd_bwd if fwd_bwd else jnp.dot) + + out_names = ["output"] + ref_outputs = ref_fn(*ref_operands) + if not fwd_bwd: + ref_outputs = [ref_outputs] + else: + out_names += ["dgrad", "wgrad"] + + for i, (test_out, ref_out) in enumerate(zip(test_outputs, ref_outputs)): + test_out_global = jax.lax.with_sharding_constraint( + test_out, NamedSharding(mesh, PartitionSpec(None)) + ) + try: + assert_allclose(ref_out, test_out_global) + except AssertionError as err: + raise AssertionError(f"Numerical mismatch in {out_names[i]}:\n" + str(err)) + + +@pytest.mark.parametrize("comm_type", COMM_TYPES) +@pytest.mark.parametrize("mesh_type", MESH_TYPES) +def test_gemm_impl(comm_type, mesh_type): + mesh, mesh_resource, batched, fsdp = _get_mesh(mesh_type) + + ( + local_operands, + global_operands, + output_info, + fsdp_gathered_rhs_spec, + ) = _get_inputs( + mesh, mesh_resource, jnp.bfloat16, comm_type, batched, fsdp + ) + + @jax.jit + def _test_fn(lhs, rhs): + rhs_no_fsdp = jax.lax.with_sharding_constraint( + rhs, NamedSharding(mesh, PartitionSpec(*fsdp_gathered_rhs_spec)) + ) + return te.cpp_extensions.gemm_impl(lhs, rhs_no_fsdp, batched_output=batched) + + with te.sharding.global_shard_guard(mesh_resource): + output, *_ = _test_fn(*local_operands) + + _check_output(mesh, *output_info, *global_operands, output) + + +@pytest.mark.parametrize("comm_type", COMM_TYPES) +@pytest.mark.parametrize("mesh_type", MESH_TYPES) +def test_gemm_fwd_bwd(comm_type, mesh_type): + mesh, mesh_resource, batched, fsdp = _get_mesh(mesh_type) + + ( + local_operands, + global_operands, + output_info, + fsdp_gathered_rhs_spec, + ) = _get_inputs( + mesh, mesh_resource, jnp.bfloat16, comm_type, batched, fsdp, fwd_bwd=True + ) + + @jax.jit + def _test_fn(lhs, rhs, grad): + # Gather weights in FSDP axis + rhs_no_fsdp = jax.lax.with_sharding_constraint( + rhs, NamedSharding(mesh, PartitionSpec(*fsdp_gathered_rhs_spec)) + ) + + # FWD pass + fwd_out, vjp_fn = jax.vjp(gemm, lhs, rhs_no_fsdp) + + # BWD pass + lhs_grad, rhs_grad = vjp_fn(grad) + + return fwd_out, lhs_grad, rhs_grad + + print( + f"INPUTS: {local_operands[0].shape} x {local_operands[1].shape}\n" + + f" LHS sharding: {local_operands[0].sharding.spec}\n" + + f" RHS sharding: {local_operands[1].sharding.spec}\n" + ) + + with te.sharding.global_shard_guard(mesh_resource): + output, dgrad, wgrad = _test_fn(*local_operands) + + print( + f"{'AG + GEMM' if comm_type == 'AG' else 'GEMM + AR'} output: " + + f"{output.shape} | {output.sharding.spec}\n" + + f"DGRAD: {dgrad.shape} | {dgrad.sharding.spec}\n" + + f"WGRAD: {wgrad.shape} | {wgrad.sharding.spec}\n" + ) + + _check_output(mesh, *output_info, *global_operands, output, dgrad, wgrad, fwd_bwd=True) + diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 0f567eecef..30ff0ca54a 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -516,31 +516,53 @@ def infer_sharding_from_operands( ) # Modify operand specs: - # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded - # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension - lhs_spec_new = [spec for spec in lhs_spec] - rhs_spec_new = [spec for spec in rhs_spec] - if lhs_spec_new[lhs_inner_dim] != 
rhs_spec_new[rhs_inner_dim] and not grad: - warnings.warn( - "Forcing LHS sharding in the contracting dimension to match RHS. This can trigger " - + "additional communication if LHS is not already partitioned correctly." + # - If contracting dimensions of both operands are sharded, force them to match. + # - If contracting dimensions of both operands are sharded, all-gather outer dimensions. + # - If contracting dimension of only one operand is sharded, all-gather the sharded + # operand. + # - Never scatter any operand. + lhs_spec_new = list(lhs_spec).copy() + rhs_spec_new = list(rhs_spec).copy() + lhs_spec_new[lhs_outer_dim] = None + if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: + assert lhs_spec_new[lhs_inner_dim] == rhs_spec_new[rhs_inner_dim], ( + "Contracting dimensions of LHS and RHS operands must have the same sharding." ) - rhs_outer_spec = rhs_spec_new[rhs_outer_dim] - if rhs_outer_spec is not None: - warnings.warn( - "Forcing the outer dimension of LHS (sequence/context dim) to be all- gathered. " - + "This may trigger additional communication if LHS is not already partitioned " - + "correctly. Additionally, the DGRAD output in the backward pass will not match " - + "the sharding of a sequence/context-parallel LHS operand." - ) - lhs_spec_new[lhs_outer_dim] = None - lhs_spec_new[lhs_inner_dim] = rhs_spec_new[rhs_inner_dim] + if lhs_spec_new[lhs_outer_dim] is not None: + warnings.warn( + "Outer dimension of the LHS operand must be all-gathered when both contracting " + + "dimensions are sharded. This will cause additional communication overhead." + ) + + if rhs_spec_new[rhs_outer_dim] is not None: + warnings.warn( + "Outer dimension of the RHS operand must be all-gathered when both contracting " + + "dimensions are sharded. This will cause additional communication overhead." + ) + rhs_spec_new[rhs_outer_dim] = None + else: + if lhs_spec_new[lhs_inner_dim] is None and rhs_spec_new[rhs_inner_dim] is not None: + warnings.warn( + "Contracting dimension of the RHS operand must be all-gathered when the " + + "contracting dimension of the LHS operand is unsharded. This will cause " + + "additional communication overhead." + ) + if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is None: + if not grad: + # This is expected for sequence/context-parallel gradient in BWD (DGRAD) GEMM. + warnings.warn( + "Contracting dimension of the LHS operand must be all-gathered when the " + + "contracting dimension of the RHS operand is unsharded. This will cause " + + "additional communication overhead." 
+ ) + lhs_spec_new[lhs_inner_dim] = None + rhs_spec_new[rhs_inner_dim] = None + out_col_spec = rhs_spec_new[rhs_outer_dim] # Output sharding is conditional on output shape lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim]] batch_spec = [lhs_spec_new[dim] for dim in lhs_bdims] - lhs_outer_spec = lhs_spec_new[lhs_outer_dim] - out_spec = [lhs_outer_spec, rhs_outer_spec] + out_spec = [None, out_col_spec] if batched_output: out_spec = batch_spec + out_spec out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) @@ -549,11 +571,11 @@ def infer_sharding_from_operands( fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) # Pre-GELU output is always 2D if GELU fusion is turned on, otherwise unsharded - gelu_spec = [lhs_outer_spec, rhs_outer_spec] if fuse_gelu else [None] + gelu_spec = [None, out_col_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) # Bias gradient spec matches outer dimension of output if bias fusion is turned on - bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) + bias_sharding = NamedSharding(mesh, PartitionSpec(out_col_spec if fuse_bias else None)) return (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, bias_sharding) @@ -583,19 +605,27 @@ def partition( ) # Modify operand specs: - # - LHS operand outer dimension is all-gathered if RHS operand outer dimension is sharded - # - LHS operand contracting dimension sharding is forced to match RHS contracting dimension - lhs_spec_new = [spec for spec in lhs_spec] - rhs_spec_new = [spec for spec in rhs_spec] - rhs_outer_spec = rhs_spec_new[rhs_outer_dim] - if rhs_outer_spec is not None: - lhs_spec_new[lhs_outer_dim] = None - lhs_spec_new[lhs_inner_dim] = rhs_spec_new[rhs_inner_dim] + # - Always all-gather the outer dimension of LHS. + # - If contracting dimensions of both operands are sharded, all-gather RHS outer dimension. + # - If contracting dimension of only one operand is sharded, all-gather the sharded + # operand. + # - Never scatter any operand. 
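# Illustrative standalone sketch of the spec-adjustment rules listed above, assuming hypothetical
# mesh axis names "dp" and "tp"; the primitive's partition() below applies the same logic to the
# padded operand specs:
def _adjust_specs_sketch(lhs_spec, rhs_spec, lhs_inner, lhs_outer, rhs_inner, rhs_outer):
    lhs_new, rhs_new = list(lhs_spec), list(rhs_spec)
    reduce_output = False
    lhs_new[lhs_outer] = None                           # never keep the LHS outer (sequence) dim sharded
    if lhs_new[lhs_inner] is not None and rhs_new[rhs_inner] is not None:
        rhs_new[rhs_outer] = None                       # both contracting dims sharded -> all-reduce output
        reduce_output = True
    else:
        lhs_new[lhs_inner] = rhs_new[rhs_inner] = None  # otherwise all-gather any sharded contracting dim
    return lhs_new, rhs_new, reduce_output

# AG+GEMM:  _adjust_specs_sketch(("dp", "tp", None), (None, "tp"), 2, 1, 0, 1)
#           -> (["dp", None, None], [None, "tp"], False)
# GEMM+AR:  _adjust_specs_sketch(("dp", None, "tp"), ("tp", None), 2, 1, 0, 1)
#           -> (["dp", None, "tp"], ["tp", None], True)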
+ lhs_spec_new = list(lhs_spec).copy() + rhs_spec_new = list(rhs_spec).copy() + reduce_output = False + lhs_spec_new[lhs_outer_dim] = None + if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: + rhs_spec_new[rhs_outer_dim] = None + reduce_output = True + else: + lhs_spec_new[lhs_inner_dim] = None + rhs_spec_new[rhs_inner_dim] = None + out_col_spec = rhs_spec_new[rhs_outer_dim] lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) rhs_sharding = NamedSharding(mesh, PartitionSpec(*rhs_spec_new)) # Bias is sharded to match outer dimension spec of the RHS operand (also the output) - bias_sharding = NamedSharding(mesh, PartitionSpec(rhs_outer_spec if fuse_bias else None)) + bias_sharding = NamedSharding(mesh, PartitionSpec(out_col_spec if fuse_bias else None)) # FP8 metas are always unsharded fp8_meta_sharding = NamedSharding(mesh, PartitionSpec(None)) @@ -603,14 +633,13 @@ def partition( # Output sharding is conditional on output shape lhs_bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim]] batch_spec = [lhs_spec_new[dim] for dim in lhs_bdims] - lhs_outer_spec = lhs_spec_new[lhs_outer_dim] - out_spec = [lhs_outer_spec, rhs_outer_spec] + out_spec = [None, out_col_spec] if batched_output: out_spec = batch_spec + out_spec out_sharding = NamedSharding(mesh, PartitionSpec(*out_spec)) # Pre-GELU output is always 2D if GELU fusion is turned on, otherwise unsharded - gelu_spec = [lhs_outer_spec, rhs_outer_spec] if fuse_gelu else [None] + gelu_spec = [None, out_col_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) arg_shardings = ( @@ -663,13 +692,11 @@ def sharded_impl( if jax_dtype_is_fp8(lhs.dtype): out_amax_updated = all_reduce_max_along_all_axes_except_PP(out_amax_updated, mesh) - if rhs_spec_new[rhs_inner_dim] is not None: - # GEMM output needs to be all-reduced when the contracting dimension is sharded. 
- out = lax_paral_op(out, jax.lax.psum, global_mesh_resource().tp_resource, mesh) + # All-reduce sum GEMM output when contracting dimensions are sharded + if reduce_output: + out = jax.lax.psum(out, global_mesh_resource().tp_resource) if fuse_gelu: - pre_gelu_out = lax_paral_op( - pre_gelu_out, jax.lax.psum, global_mesh_resource().tp_resource, mesh - ) + pre_gelu_out = jax.lax.psum(pre_gelu_out, global_mesh_resource().tp_resource) return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 464ccb12f9..4cf09a204f 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -33,7 +33,7 @@ def gemm( x: ArrayLike, kernel: ArrayLike, bias: Optional[ArrayLike] = None, - contracting_dims: Tuple[int, int] = (1, 0), + contracting_dims: Tuple[int, int] = (-1, -2), fuse_gelu: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, @@ -73,8 +73,11 @@ def _gemm_fwd_rule( fuse_bias = bias is not None - # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) --------> ([B], M, N/P) - # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) + # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) ------> ([B], M, N/P) + # (DP, TP, None) --(AG)--> (DP, None, None) x (None, TP) --> (DP, None, TP) + # + # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) + # (DP, None, TP) x (TP, None) --(AR)--> (DP, None, None) out, pre_gelu_out = gemm_impl( x, kernel, @@ -112,12 +115,18 @@ def _gemm_bwd_rule( ) # FWD MODE: - # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) --------> ([B], M, N/P) - # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) + # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) ------> ([B], M, N/P) + # (DP, TP, None) --(AG)--> (DP, None, None) x (None, TP) --> (DP, None, TP) + # + # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) + # (DP, None, TP) x (TP, None) --(AR)--> (DP, None, None) # DGRAD: - # AG+GEMM: ([B], M, N/P) x (K, N/P)^T --(AR)--> ([B], M, K) - # GEMM+AR: ([B], M, N) x (K/P, N)^T --------> ([B], M, K/P) + # AG+GEMM: ([B], M, N/P) x (K, N/P)^T ----(AR)----> ([B], M, K) + # (DP, None, TP) x (None, TP)^T --(AR)--> (DP, None, None) + # + # GEMM+AR: ([B], M, N) x (K/P, N)^T ------> ([B], M, K/P) + # (DP, None, None) x (TP, None)^T --> (DP, None, TP) dgrad, dgelu, _ = gemm_impl( grad, kernel, @@ -133,7 +142,11 @@ def _gemm_bwd_rule( # WGRAD: # AG+GEMM: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) - # GEMM+AR: ([B], M, K/P)^T x ([B], M, N) ----> (K/P, N) + # (DP, 'tp', None)^T --(AG)-->(DP, None, None)^T x (DP, None, 'tp') --> (None, 'tp') + # + # GEMM+AR: ([B], M, K/P)^T --(AG)--> ([B], M, K)^T x ([B], M, N) ---------> (K/P, N) + # (DP, None, 'tp')^T --(AG)--> (DP, None, None)^T x (DP, None, None) ----> (None, None) + # Make XLA scatter output in first dim. 
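# Illustrative note on the WGRAD call below: with the default contracting_dims=(-1, -2) in the
# forward pass, x ([B], M, K) resolves to x_inner_dim = K and x_outer_dim = M, so WGRAD contracts
# dim M of x against dim (wgrad_rhs.ndim - 2), i.e. M of grad (or dgelu), folds the batch dims
# into that contraction (batched_output=False), and yields a 2D (K, N) wgrad, matching the spec
# flow sketched in the comments above.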
wgrad_rhs = dgelu if fuse_gelu else grad wgrad, _, bgrad = gemm_impl( x, @@ -445,7 +458,7 @@ def type_safe_gemm( bias: Optional[ArrayLike] = None, fp8_meta: Optional[FP8MetaPackage] = None, out_dtype: Optional[jnp.dtype] = None, - contracting_dims: Tuple[int, int] = (1, 0), + contracting_dims: Tuple[int, int] = (-1, -2), fuse_gelu: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, From 2ce4377702d20d48564383647caede1f2dcf1e6e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Nov 2024 21:55:29 +0000 Subject: [PATCH 18/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/jax/test_distributed_gemm.py | 35 +++++++------------ transformer_engine/jax/cpp_extensions/gemm.py | 6 ++-- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/tests/jax/test_distributed_gemm.py b/tests/jax/test_distributed_gemm.py index f1e3c58c4a..b246999d8a 100644 --- a/tests/jax/test_distributed_gemm.py +++ b/tests/jax/test_distributed_gemm.py @@ -18,7 +18,7 @@ from utils import assert_allclose -jax.config.update('jax_enable_compilation_cache', False) +jax.config.update("jax_enable_compilation_cache", False) # AG+GEMM: (4, 32/P, 128) ----(AG)----> (4, 32, 128) x (128, 256/P) ----------> (4, 32, 256/P) @@ -48,20 +48,18 @@ def _get_mesh(parallel_dist): batched = False fsdp = False mesh_shape = dict(tp=NUM_DEVICES) - resources = dict(cp_resource='tp', tp_resource='tp') + resources = dict(cp_resource="tp", tp_resource="tp") if parallel_dist in ["DP_TP", "FSDP_TP"]: batched = True - mesh_shape.update(dict(tp=NUM_DEVICES//2, dp=NUM_DEVICES//2)) - resources.update(dict(dp_resource='dp')) + mesh_shape.update(dict(tp=NUM_DEVICES // 2, dp=NUM_DEVICES // 2)) + resources.update(dict(dp_resource="dp")) if parallel_dist == "FSDP_TP": fsdp = True - mesh_shape.update(dict(tp=NUM_DEVICES//2, dp=1, zp=NUM_DEVICES//2)) - resources.update(dict(fsdp_resource='zp')) + mesh_shape.update(dict(tp=NUM_DEVICES // 2, dp=1, zp=NUM_DEVICES // 2)) + resources.update(dict(fsdp_resource="zp")) mesh_resource = te.MeshResource(**resources) - devices = mesh_utils.create_device_mesh( - (NUM_DEVICES, ), devices=jax.devices()[:NUM_DEVICES] - ) + devices = mesh_utils.create_device_mesh((NUM_DEVICES,), devices=jax.devices()[:NUM_DEVICES]) mesh = Mesh(np.array(devices).reshape(tuple(mesh_shape.values())), tuple(mesh_shape.keys())) @@ -73,9 +71,7 @@ def _get_inputs(mesh, mesh_resource, dtype, fwd_comm_type, batched, fsdp, fwd_bw # Operand and output shapes lhs_shape = ( - [SEQ_LEN, HIDDEN_SIZE] - if fwd_comm_type == "ALL_GATHER" - else [SEQ_LEN, FFN_HIDDEN_SIZE] + [SEQ_LEN, HIDDEN_SIZE] if fwd_comm_type == "ALL_GATHER" else [SEQ_LEN, FFN_HIDDEN_SIZE] ) rhs_shape = ( [HIDDEN_SIZE, FFN_HIDDEN_SIZE] @@ -125,12 +121,12 @@ def _get_inputs(mesh, mesh_resource, dtype, fwd_comm_type, batched, fsdp, fwd_bw sigma = 0.023 shapes = (lhs_shape, rhs_shape) if fwd_bwd: - shapes += (out_shape, ) + shapes += (out_shape,) global_operands = list( map( lambda key, shape: jax.device_put( mu + (sigma * jax.random.normal(key, shape, dtype=dtype)), - NamedSharding(mesh, PartitionSpec(None)) + NamedSharding(mesh, PartitionSpec(None)), ), split_keys, shapes, @@ -140,7 +136,7 @@ def _get_inputs(mesh, mesh_resource, dtype, fwd_comm_type, batched, fsdp, fwd_bw # Allocate sharded operands on device partition_axes = (lhs_spec, rhs_spec) if fwd_bwd: - partition_axes += (out_spec, ) + partition_axes += (out_spec,) 
local_operands = list( map( lambda x, spec: jax.device_put(x, NamedSharding(mesh, PartitionSpec(*spec))), @@ -245,9 +241,7 @@ def test_gemm_impl(comm_type, mesh_type): global_operands, output_info, fsdp_gathered_rhs_spec, - ) = _get_inputs( - mesh, mesh_resource, jnp.bfloat16, comm_type, batched, fsdp - ) + ) = _get_inputs(mesh, mesh_resource, jnp.bfloat16, comm_type, batched, fsdp) @jax.jit def _test_fn(lhs, rhs): @@ -272,9 +266,7 @@ def test_gemm_fwd_bwd(comm_type, mesh_type): global_operands, output_info, fsdp_gathered_rhs_spec, - ) = _get_inputs( - mesh, mesh_resource, jnp.bfloat16, comm_type, batched, fsdp, fwd_bwd=True - ) + ) = _get_inputs(mesh, mesh_resource, jnp.bfloat16, comm_type, batched, fsdp, fwd_bwd=True) @jax.jit def _test_fn(lhs, rhs, grad): @@ -308,4 +300,3 @@ def _test_fn(lhs, rhs, grad): ) _check_output(mesh, *output_info, *global_operands, output, dgrad, wgrad, fwd_bwd=True) - diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 30ff0ca54a..250e8e0c29 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -525,9 +525,9 @@ def infer_sharding_from_operands( rhs_spec_new = list(rhs_spec).copy() lhs_spec_new[lhs_outer_dim] = None if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: - assert lhs_spec_new[lhs_inner_dim] == rhs_spec_new[rhs_inner_dim], ( - "Contracting dimensions of LHS and RHS operands must have the same sharding." - ) + assert ( + lhs_spec_new[lhs_inner_dim] == rhs_spec_new[rhs_inner_dim] + ), "Contracting dimensions of LHS and RHS operands must have the same sharding." if lhs_spec_new[lhs_outer_dim] is not None: warnings.warn( "Outer dimension of the LHS operand must be all-gathered when both contracting " From f68d71edc56980932b4a4a07ab7d26c44fdaa4e7 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 5 Dec 2024 21:29:27 +0000 Subject: [PATCH 19/32] restored old test_custom_call_compute.py to remove erroneous changes Signed-off-by: Alp Dener --- tests/jax/test_custom_call_compute.py | 50 --------------------------- 1 file changed, 50 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 355f587265..20b16c2809 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -25,7 +25,6 @@ _jax_dbias_cast_transpose, ) from transformer_engine.jax.cpp_extensions.quantization import _jax_cast_fp8 -from transformer_engine.jax.gemm import fp8_gemm, gemm from transformer_engine.jax import cpp_extensions as tex @@ -416,55 +415,6 @@ def ref_func(x, ln_s, y, z, w, v, amax_list_1, amax_list_2, scale_list_1, scale_ ) -class TestGemm: - - @staticmethod - def _generate_inputs(b, m, n, k, dtype): - key = jax.random.PRNGKey(0) - subkeys = jax.random.split(key, 3) - a = jax.random.normal(subkeys[0], (b, m, k), dtype) - b = jax.random.normal(subkeys[1], (n, k), dtype) - bias_dtype = dtype if dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2] else jnp.bfloat16 - bias = jax.random.normal(subkeys[2], (n,), bias_dtype) - return a, b, bias - - @staticmethod - def _generate_fp8_inputs(b, m, n, k, fp8_dtype): - a, b, bias = TestGemm._generate_inputs(b, m, n, k, jnp.bfloat16) - a_scale, b_scale = map(lambda x: (jnp.max(jnp.abs(x)) / 127.0).astype(jnp.float32), [a, b]) - a_q, b_q = map( - lambda x, x_scale: jnp.round(x / x_scale).astype(fp8_dtype), - [(a, a_scale), (b, b_scale)], - ) - return a, a_q, jnp.reciprocal(a_scale), b, b_q, 
jnp.reciprocal(b_scale), bias - - @pytest.mark.parametrize("m,n,k", GEMM_CASES) - @pytest.mark.parametrize("use_bias", (False, True)) - @pytest.mark.parametrize("do_gelu", (False, True)) - def test_gemm(self, b, m, n, k, use_bias, do_gelu): - a, b, bias = self._generate_inputs(b, m, n, k, jnp.bfloat16) - - primitive_out = gemm(a, b, bias=bias if use_bias else None, layout="NT", do_gelu=do_gelu) - ref_out = jnp.dot(a, b) - if use_bias: - ref_out += bias - if do_gelu: - ref_out = jax.nn.gelu(ref_out) - - assert_allclose(primitive_out, ref_out, dtype=jnp.bfloat16) - - @pytest.mark.skipif(not is_fp8_supported, reason=reason) - @pytest.mark.parametrize("m,n,k", GEMM_CASES) - @pytest.mark.parametrize("fp8_dtype", FP8_COMPUTE_TYPE) - def test_fp8_gemm(self, m, n, k, fp8_dtype): - a, a_q, a_scale_inv, b, b_q, b_scale_inv, _ = self._generate_fp8_inputs(m, n, k, fp8_dtype) - - primitive_out = fp8_gemm(a_q, a_scale_inv, b_q, b_scale_inv, out_dtype=jnp.bfloat16) - ref_out = jnp.dot(a, b) - - assert_allclose(primitive_out, ref_out, dtype=fp8_dtype) - - @pytest.fixture(name="random_inputs") def random_inputs_fixture(shape): key = jax.random.PRNGKey(0) From 6b322bb163c2de7d53cd69cb9306c5f0567fcdf6 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 14 Nov 2024 09:23:13 +0000 Subject: [PATCH 20/32] added XLA custom ops and C++ infrastructure for comm+GEMM overlap in TE/JAX Signed-off-by: Alp Dener comm+GEMM overlap API for TE/JAX compiles, untested, but did not break collective GEMM op Signed-off-by: Alp Dener [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci fixed static args Signed-off-by: Alp Dener [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .gitmodules | 3 + 3rdparty/dlpack | 1 + build_tools/jax.py | 20 + setup.py | 1 + .../comm_gemm_overlap/comm_gemm_overlap.cpp | 72 ++- .../transformer_engine/comm_gemm_overlap.h | 143 +++-- .../transformer_engine/transformer_engine.h | 2 +- .../common/transformer_engine.cpp | 2 +- .../common/util/dlpack_helper.h | 188 ++++++ .../common/util/pybind_helper.h | 18 +- transformer_engine/jax/cpp_extensions/gemm.py | 566 ++++++++++++----- transformer_engine/jax/csrc/extensions.h | 100 ++- .../jax/csrc/extensions/comm_gemm_overlap.cpp | 291 +++++++++ .../jax/csrc/extensions/gemm.cpp | 11 +- .../jax/csrc/extensions/packing.cpp | 34 +- .../jax/csrc/extensions/pybind.cpp | 14 + transformer_engine/jax/gemm.py | 575 +++++++++++++++++- transformer_engine/pytorch/csrc/extensions.h | 3 +- .../csrc/extensions/comm_gemm_overlap.cpp | 14 +- .../pytorch/csrc/extensions/pybind.cpp | 5 +- transformer_engine/pytorch/module/base.py | 13 +- 21 files changed, 1815 insertions(+), 261 deletions(-) create mode 160000 3rdparty/dlpack create mode 100644 transformer_engine/common/util/dlpack_helper.h create mode 100644 transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp diff --git a/.gitmodules b/.gitmodules index 21492db5ef..7fc91b1f54 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "3rdparty/cudnn-frontend"] path = 3rdparty/cudnn-frontend url = https://github.com/NVIDIA/cudnn-frontend.git +[submodule "3rdparty/dlpack"] + path = 3rdparty/dlpack + url = git@github.com:dmlc/dlpack.git diff --git a/3rdparty/dlpack b/3rdparty/dlpack new file mode 160000 index 0000000000..bbd2f4d324 --- /dev/null +++ b/3rdparty/dlpack @@ -0,0 +1 @@ +Subproject commit bbd2f4d32427e548797929af08cfe2a9cbb3cf12 diff --git a/build_tools/jax.py b/build_tools/jax.py index 
f829230f50..bb4da4e5ed 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -5,6 +5,7 @@ """JAX related extensions.""" import os from pathlib import Path +from typing import Optional import setuptools from glob import glob @@ -36,6 +37,7 @@ def setup_jax_extension( csrc_source_files, csrc_header_files, common_header_files, + third_party_packages, ) -> setuptools.Extension: """Setup PyBind11 extension for JAX support""" # Source files @@ -55,12 +57,28 @@ def setup_jax_extension( common_header_files / "common" / "include", csrc_header_files, xla_home, + third_party_packages / "dlpack" / "include", ] # Compile flags cxx_flags = ["-O3"] nvcc_flags = ["-O3"] + # Userbuffers MPI dependence + libraries = [] + library_dirs = [] + if bool(int(os.getenv("NVTE_UB_WITH_MPI", "0"))): + mpi_home = os.getenv("MPI_HOME") + assert mpi_home is not None, "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1" + mpi_home = Path(mpi_home) + libraries.append("mpi") + library_dirs.append(mpi_home / "lib") + + include_dirs.append(mpi_home / "include") + + cxx_flags.append("-DNVTE_UB_WITH_MPI") + nvcc_flags.append("-DNVTE_UB_WITH_MPI") + # Define TE/JAX as a Pybind11Extension from pybind11.setup_helpers import Pybind11Extension @@ -79,5 +97,7 @@ def _add_cflags(self, flags: List[str]) -> None: "transformer_engine_jax", sources=[str(path) for path in sources], include_dirs=[str(path) for path in include_dirs], + library_dirs=[str(path) for path in library_dirs], + libraries=libraries, extra_compile_args={"cxx": cxx_flags, "nvcc": nvcc_flags}, ) diff --git a/setup.py b/setup.py index 3bb2fe6b95..a702399bc9 100644 --- a/setup.py +++ b/setup.py @@ -164,6 +164,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: "transformer_engine/jax/csrc", current_file_path / "transformer_engine" / "jax" / "csrc", current_file_path / "transformer_engine", + current_file_path / "3rdparty", ) ) if "paddle" in frameworks: diff --git a/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp b/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp index c6f0f870ff..810eeb2ebe 100644 --- a/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp +++ b/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp @@ -139,11 +139,12 @@ CommOverlapBase::CommOverlapBase(const std::vector &buffer_shape, DType int numnodes, int tp_size, ExtAllgatherOp allgather_handle, ExtBarrierOp barrier_handle, int num_splits, int num_max_streams, int comm_cga_size, int num_comm_sm, bool set_sm_margin, - bool atomic_gemm) + bool atomic_gemm, bool overlap_first_gemm) : CommOverlapCore(myrank, numranks, mylocal, numlocal, mynode, numnodes, tp_size, allgather_handle, barrier_handle, num_splits, num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, false, atomic_gemm) { _rs_kernel_type = getenv("NVTE_RS_STRIDED_ATOMIC", 0); + _overlap_first_gemm = overlap_first_gemm; NVTE_CHECK(_rs_kernel_type >= 0 && _rs_kernel_type <= 3, "Invalid choice for NVTE_RS_STRIDED_ATOMIC: Must be 0 (non-atomic), 1 (atomic) ", "or 2 (multi-atomic)."); @@ -164,6 +165,36 @@ CommOverlapBase::~CommOverlapBase() { cudaStreamDestroy(_stream_comm); } +TensorWrapper CommOverlapBase::get_ubuf_output(CommOverlapType comm_type) { + char *output_ptr = reinterpret_cast(_ubuf.dptr()); + if (comm_type == CommOverlapType::RS) + output_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); + size_t output_c_dim0 = + (comm_type == CommOverlapType::AG) ? 
_ubuf.size(0) : _ubuf.size(0) / _tp_size; + size_t output_c_dim1 = _ubuf.size(1); + return TensorWrapper(reinterpret_cast(output_ptr), {output_c_dim0, output_c_dim1}, + _ubuf.dtype()); +} + +void CommOverlapBase::copy_into_ubuf(cudaStream_t stream, TensorWrapper &input, + CommOverlapType comm_type) { + char *ubuf_ptr = reinterpret_cast(_ubuf.dptr()); + if (comm_type == CommOverlapType::AG) { + if ((input.numel() * _tp_size) != (int64_t)_ubuf.numel() || + input.element_size() != (int64_t)_ubuf.element_size()) { + NVTE_ERROR("Input and buffer sizes do not match!"); + } + ubuf_ptr += _ubuf.numel() / _tp_size * _tp_id * _ubuf.element_size(); + } else { + if (input.numel() != (int64_t)_ubuf.numel() || + input.element_size() != (int64_t)_ubuf.element_size()) { + NVTE_ERROR("Input and buffer sizes do not match!"); + } + } + NVTE_CHECK_CUDA(cudaMemcpyAsync(ubuf_ptr, input.dptr(), input.numel() * input.element_size(), + cudaMemcpyDeviceToDevice, stream)); +} + /* ** Bulk GEMM + COMM ** This function assumes the communication input is pre-copied to _ubuf @@ -225,8 +256,7 @@ void CommOverlapBase::atomic_gemm_overlap_rs(TensorWrapper &A, bool transa, Tens bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, bool use_split_accumulator, - bool gemm_overlap, TensorWrapper &rs_output, - cudaStream_t stream_main) { + TensorWrapper &rs_output, cudaStream_t stream_main) { int ori_sms = _ub_comm->sms; _ub_comm->use_ce = _use_ce; _ub_comm->sms = _num_comm_sm; @@ -325,8 +355,7 @@ void CommOverlapBase::split_overlap_rs(TensorWrapper &A, bool transa, TensorWrap TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, bool use_split_accumulator, - bool gemm_overlap, TensorWrapper &rs_output, - cudaStream_t stream_main) { + TensorWrapper &rs_output, cudaStream_t stream_main) { // Get GEMM dimensions int ori_sms = _ub_comm->sms; _ub_comm->use_ce = _use_ce; @@ -358,7 +387,7 @@ void CommOverlapBase::split_overlap_rs(TensorWrapper &A, bool transa, TensorWrap assert(pre_gelu_out.numel() == 0); - if (gemm_overlap) { + if (_overlap_first_gemm) { auto input_a_chunk = TensorWrapper(A.dptr(), {m_chunk, k}, A.dtype(), nullptr, nullptr, A.scale_inv()); auto output_chunk = @@ -565,6 +594,37 @@ CommOverlapP2PBase::~CommOverlapP2PBase() { cudaStreamDestroy(_stream_send); } +TensorWrapper CommOverlapP2PBase::get_ubuf_output(CommOverlapType comm_type) { + char *output_ptr = reinterpret_cast(_ubuf.dptr()); + if (comm_type == CommOverlapType::RS) + output_ptr += _ubuf.numel() / _tp_size * _self_chunk_id * _ubuf.element_size(); + size_t output_c_dim0 = + (comm_type == CommOverlapType::AG) ? 
_ubuf.size(0) : _ubuf.size(0) / _tp_size; + size_t output_c_dim1 = _ubuf.size(1); + return TensorWrapper(reinterpret_cast(output_ptr), {output_c_dim0, output_c_dim1}, + _ubuf.dtype()); +} + +void CommOverlapP2PBase::copy_into_ubuf(cudaStream_t stream, TensorWrapper &input, + CommOverlapType comm_type) { + if (comm_type == CommOverlapType::RS) { + // Copy input to the target ubuf chunk by rank offset + if (input.numel() != _ubufs[0].numel() || input.element_size() != _ubufs[0].element_size()) { + NVTE_ERROR("Input and buffer sizes do not match!"); + } + NVTE_CHECK_CUDA(cudaMemcpyAsync(_ubufs[_tp_id].dptr(), input.dptr(), + input.numel() * input.element_size(), cudaMemcpyDeviceToDevice, + stream)); + } else { + if (input.numel() != _ubuf.numel() || input.element_size() != _ubuf.element_size()) { + NVTE_ERROR("Input and buffer sizes do not match!"); + } + NVTE_CHECK_CUDA(cudaMemcpyAsync(_ubuf.dptr(), input.dptr(), + input.numel() * input.element_size(), cudaMemcpyDeviceToDevice, + stream)); + } +} + /* ** Split AllGather + AtomicGEMM using P2P communication ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. This is needed to have AG diff --git a/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h b/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h index 1d5d192a39..16e4ccf16a 100644 --- a/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h +++ b/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h @@ -17,6 +17,9 @@ #define NVTE_COMM_OVERLAP_MAX_STREAMS 3 +#define NOT_IMPLEMENTED_ERROR() NVTE_ERROR("Operation is not implemented.") + +#define NOT_SUPPORTED_ERROR() NVTE_ERROR("Operation not supported.") namespace transformer_engine { /* \brief Check if Userbufers bootstraps with direct calls to MPI collectives. 
@@ -26,9 +29,9 @@ namespace transformer_engine { */ bool ubuf_built_with_mpi(); -enum class CommOverlapType { RS = 0, AG = 1 }; +enum class CommOverlapType : int32_t { RS = 0, AG = 1 }; -enum class CommOverlapAlgo { +enum class CommOverlapAlgo : int32_t { BULK_OVERLAP_AG = 0, BULK_OVERLAP_RS = 1, SPLIT_PIPELINED_AG_P2P = 2, @@ -77,16 +80,64 @@ class CommOverlapCore { _ubuf_scale_inv_initialized = true; } - bool is_atomic_gemm() { return _atomic_gemm; } + virtual TensorWrapper get_ubuf_output(CommOverlapType comm_type) { NOT_IMPLEMENTED_ERROR(); } + + virtual void copy_into_ubuf(cudaStream_t stream, TensorWrapper &input, + CommOverlapType comm_type) { + NOT_IMPLEMENTED_ERROR(); + } + + virtual bool is_atomic_gemm() { return _atomic_gemm; } + + virtual bool is_p2p_overlap() { return _is_p2p; } + + virtual bool is_fp8_ubuf() { return _ubuf.element_size() == 1; } + + virtual void bulk_overlap(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, + TensorWrapper &workspace, bool grad, bool accumulate, + bool use_split_accumulator, CommOverlapType comm_type, + TensorWrapper &rs_output, cudaStream_t stream_main) { + NOT_IMPLEMENTED_ERROR(); + } + + virtual void atomic_gemm_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, + TensorWrapper &pre_gelu_out, TensorWrapper &workspace, + bool grad, bool accumulate, bool use_split_accumulator, + TensorWrapper &rs_output, cudaStream_t stream_main) { + NOT_IMPLEMENTED_ERROR(); + } - bool is_p2p_overlap() { return _is_p2p; } + virtual void split_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, + TensorWrapper &workspace, bool grad, bool accumulate, + bool use_split_accumulator, TensorWrapper &rs_output, + cudaStream_t stream_main) { + NOT_IMPLEMENTED_ERROR(); + } - bool is_fp8_ubuf() { return _ubuf.element_size() == 1; } + virtual void atomic_gemm_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, + TensorWrapper &pre_gelu_out, TensorWrapper &workspace, + bool grad, bool accumulate, bool use_split_accumulator, + TensorWrapper &B_copy, cudaStream_t stream_main) { + NOT_IMPLEMENTED_ERROR(); + } + + virtual void split_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, + TensorWrapper &workspace, bool grad, bool accumulate, + bool use_split_accumulator, TensorWrapper &B_copy, + cudaStream_t stream_main) { + NOT_IMPLEMENTED_ERROR(); + } }; // CommOverlapCore class CommOverlapBase : public CommOverlapCore { protected: int _rs_kernel_type; + bool _overlap_first_gemm; cudaStream_t _stream_comm; cudaEvent_t _start_d2dcopy; @@ -95,36 +146,47 @@ class CommOverlapBase : public CommOverlapCore { int numranks, int mylocal, int numlocal, int mynode, int numnodes, int tp_size, ExtAllgatherOp allgather_handle, ExtBarrierOp barrier_handle, int num_splits = 3, int num_max_streams = NVTE_COMM_OVERLAP_MAX_STREAMS, int comm_cga_size = 2, - int num_comm_sm = 16, bool set_sm_margin = true, bool atomic_gemm = false); + int num_comm_sm = 16, bool set_sm_margin = true, bool atomic_gemm = false, + bool overlap_first_gemm = false); virtual ~CommOverlapBase(); - /* - ** Bulk GEMM + COMM - ** This function assumes the communication input is pre-copied to _ubuf - */ + TensorWrapper 
get_ubuf_output(CommOverlapType comm_type); + + void copy_into_ubuf(cudaStream_t stream, TensorWrapper &input, CommOverlapType comm_type); + void bulk_overlap(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, bool use_split_accumulator, CommOverlapType comm_type, TensorWrapper &rs_output, cudaStream_t stream_main); - /* - ** Split FPROP GEMM + ReduceScatter - */ void atomic_gemm_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, - bool use_split_accumulator, bool gemm_overlap, - TensorWrapper &rs_output, cudaStream_t stream_main); + bool use_split_accumulator, TensorWrapper &rs_output, + cudaStream_t stream_main); - /* - ** Split FPROP GEMM + ReduceScatter - */ void split_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, - bool use_split_accumulator, bool gemm_overlap, TensorWrapper &rs_output, + bool use_split_accumulator, TensorWrapper &rs_output, cudaStream_t stream_main); + + void atomic_gemm_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, + TensorWrapper &workspace, bool grad, bool accumulate, + bool use_split_accumulator, TensorWrapper &B_copy, + cudaStream_t stream_main) { + NOT_SUPPORTED_ERROR(); + } + + void split_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, + TensorWrapper &workspace, bool grad, bool accumulate, + bool use_split_accumulator, TensorWrapper &B_copy, + cudaStream_t stream_main) { + NOT_SUPPORTED_ERROR(); + } }; // CommOverlapBase class CommOverlapP2PBase : public CommOverlapCore { @@ -155,44 +217,39 @@ class CommOverlapP2PBase : public CommOverlapCore { virtual ~CommOverlapP2PBase(); - /* - ** Split AllGather + AtomicGEMM using P2P communication - ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. This is needed to have AG - ** outputs in each rank to be in the contiguous memory space after all ring exchange phases. - */ - void atomic_gemm_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + TensorWrapper get_ubuf_output(CommOverlapType comm_type); + + void copy_into_ubuf(cudaStream_t stream, TensorWrapper &input, CommOverlapType comm_type); + + void bulk_overlap(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, + TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, + bool grad, bool accumulate, bool use_split_accumulator, + CommOverlapType comm_type, TensorWrapper &rs_output, cudaStream_t stream_main) { + NOT_SUPPORTED_ERROR(); + } + + void atomic_gemm_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, - bool use_split_accumulator, TensorWrapper &B_copy, + bool use_split_accumulator, TensorWrapper &rs_output, cudaStream_t stream_main); - /* - ** Split AllGather + GEMM using P2P communication - ** This function assumes the input_b is pre-copied to _ubufs[rank_id]. 
This is needed to have AG - ** outputs in each rank to be in the contiguous memory space after all ring exchange phases. - */ - void split_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + void split_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, - bool use_split_accumulator, TensorWrapper &B_copy, + bool use_split_accumulator, TensorWrapper &rs_output, cudaStream_t stream_main); - /* - ** Split ReduceScatter + GEMM using P2P communication - */ - void atomic_gemm_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + void atomic_gemm_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, - bool use_split_accumulator, TensorWrapper &rs_output, + bool use_split_accumulator, TensorWrapper &B_copy, cudaStream_t stream_main); - /* - ** Split ReduceScatter + GEMM using P2P communication - */ - void split_overlap_rs(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, + void split_overlap_ag(TensorWrapper &A, bool transa, TensorWrapper &B, bool transb, TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad, bool accumulate, - bool use_split_accumulator, TensorWrapper &rs_output, + bool use_split_accumulator, TensorWrapper &B_copy, cudaStream_t stream_main); }; // CommOverlapP2PBase diff --git a/transformer_engine/common/include/transformer_engine/transformer_engine.h b/transformer_engine/common/include/transformer_engine/transformer_engine.h index d302518235..6fdc93098f 100644 --- a/transformer_engine/common/include/transformer_engine/transformer_engine.h +++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h @@ -393,7 +393,7 @@ class TensorWrapper { return nvte_tensor_scale_inv(tensor_); } - private: + protected: /*! \brief Wrapped NVTETensor. */ NVTETensor tensor_ = nullptr; }; diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index 1a3b49f9fa..b92a993d49 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -93,7 +93,7 @@ NVTEShape nvte_tensor_shape(const NVTETensor tensor) { return ret; } -size_t nvte_tensor_ndim(const NVTETensor tensor) { +size_t nvte_tensor_ndims(const NVTETensor tensor) { const auto &t = *reinterpret_cast(tensor); return t.data.shape.size(); } diff --git a/transformer_engine/common/util/dlpack_helper.h b/transformer_engine/common/util/dlpack_helper.h new file mode 100644 index 0000000000..cd8210e37a --- /dev/null +++ b/transformer_engine/common/util/dlpack_helper.h @@ -0,0 +1,188 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +#ifndef TRANSFORMER_ENGINE_COMMON_UTIL_DLPACK_HELPER_H +#define TRANSFORMER_ENGINE_COMMON_UTIL_DLPACK_HELPER_H + +#include +#include +#include + +#include "cuda_runtime.h" +#include "logging.h" + +namespace transformer_engine { + +DLDataType nvte_dtype_to_dldtype(DType dtype) { + DLDataType dldtype; + dldtype.lanes = 1; + switch (dtype) { + case DType::kInt64: + dldtype.bits = 64; + dldtype.code = DLDataTypeCode::kDLInt; + break; + + case DType::kInt32: + dldtype.bits = 32; + dldtype.code = DLDataTypeCode::kDLInt; + break; + + case DType::kByte: + dldtype.bits = 8; + dldtype.code = DLDataTypeCode::kDLUInt; + break; + + case DType::kFloat32: + dldtype.bits = 32; + dldtype.code = DLDataTypeCode::kDLFloat; + break; + + case DType::kFloat16: + dldtype.bits = 16; + dldtype.code = DLDataTypeCode::kDLFloat; + break; + + case DType::kBFloat16: + dldtype.bits = 16; + dldtype.code = DLDataTypeCode::kDLBfloat; + break; + + case DType::kFloat8E4M3: + dldtype.bits = 8; + dldtype.code = DLDataTypeCode::kDLFloat; + break; + + case DType::kFloat8E5M2: + dldtype.bits = 8; + dldtype.code = DLDataTypeCode::kDLFloat; + break; + + default: + NVTE_ERROR("Unrecognized transformer_engine::DType."); + } + return dldtype; +} + +DType dldtype_to_nvte_dtype(const DLDataType &dldtype, bool grad) { + NVTE_CHECK(dldtype.lanes == 1, "Unsupported number of lanes in DLDataType: ", dldtype.lanes); + + switch (dldtype.code) { + case DLDataTypeCode::kDLInt: + switch (dldtype.bits) { + case 64: + return DType::kInt64; + + case 32: + return DType::kInt32; + + default: + NVTE_ERROR("Unsupported bits in integer DLDataType: ", dldtype.bits); + } + + case DLDataTypeCode::kDLFloat: + switch (dldtype.bits) { + case 32: + return DType::kFloat32; + + case 16: + return DType::kFloat16; + + case 8: + if (grad) { + return DType::kFloat8E5M2; + } else { + return DType::kFloat8E4M3; + } + + default: + NVTE_ERROR("Unsupported bits in float DLDataType: ", dldtype.bits); + } + + case DLDataTypeCode::kDLBfloat: + if (dldtype.bits == 16) { + return DType::kBFloat16; + } else { + NVTE_ERROR("Unsupported bits in bfloat DLDataType: ", dldtype.bits); + } + + case DLDataTypeCode::kDLBool: + case DLDataTypeCode::kDLUInt: + if (dldtype.bits == 8) { + return DType::kByte; + } else { + NVTE_ERROR("Unsupported bits in unsigned int DLDataType: ", dldtype.bits); + } + + default: + NVTE_ERROR("Unsupported DLDataType."); + } +} + +class DLPackWrapper : public TensorWrapper { + protected: + DLManagedTensor managed_tensor; + + public: + // Inherit TensorWrapper constructors + using TensorWrapper::TensorWrapper; + + // Construct a new DLPackWrapper from existing TensorWrapper + DLPackWrapper(TensorWrapper &&other) : TensorWrapper(std::move(other)) {} + + // New constructor from PyObject + DLPackWrapper(pybind11::object obj, bool grad = false) { + NVTE_CHECK(PyCapsule_CheckExact(obj.ptr()), "Expected DLPack capsule"); + + DLManagedTensor *dlMTensor = (DLManagedTensor *)PyCapsule_GetPointer(obj.ptr(), "dltensor"); + NVTE_CHECK(dlMTensor, "Invalid DLPack capsule."); + + DLTensor *dlTensor = &dlMTensor->dl_tensor; + NVTE_CHECK(dlTensor->device.device_type == DLDeviceType::kDLCUDA, + "DLPack tensor is not on a CUDA device."); + NVTE_CHECK(dlTensor->device.device_id == cuda::current_device(), + "DLPack tensor resides on a different device."); + + if (dlTensor->strides) { + for (int idx = dlTensor->ndim - 1; idx >= 0; ++idx) { + NVTE_CHECK(dlTensor->strides[idx] == 1, + "DLPack tensors with 
non-standard strides are not supported."); + } + } + + NVTEShape shape; + shape.data = reinterpret_cast(dlTensor->shape); + shape.ndim = static_cast(dlTensor->ndim); + this->tensor_ = nvte_create_tensor( + dlTensor->data, shape, static_cast(dldtype_to_nvte_dtype(dlTensor->dtype, grad)), + nullptr, nullptr, nullptr); + } + + pybind11::object capsule() { + DLDevice tensor_context; + tensor_context.device_type = DLDeviceType::kDLCUDA; + tensor_context.device_id = cuda::current_device(); + + DLTensor dlTensor; + dlTensor.data = dptr(); + dlTensor.device = tensor_context; + dlTensor.ndim = ndim(); + dlTensor.dtype = nvte_dtype_to_dldtype(dtype()); + dlTensor.shape = reinterpret_cast(const_cast(shape().data)); + dlTensor.strides = nullptr; + dlTensor.byte_offset = 0; + + managed_tensor.dl_tensor = dlTensor; + managed_tensor.manager_ctx = nullptr; + managed_tensor.deleter = [](DLManagedTensor *) {}; + + return pybind11::reinterpret_steal( + PyCapsule_New(&managed_tensor, "dltensor", nullptr)); + } +}; + +} // namespace transformer_engine + +#endif diff --git a/transformer_engine/common/util/pybind_helper.h b/transformer_engine/common/util/pybind_helper.h index a36ff3f0f9..6fa9574f63 100644 --- a/transformer_engine/common/util/pybind_helper.h +++ b/transformer_engine/common/util/pybind_helper.h @@ -28,7 +28,8 @@ .value("NVTE_NO_BIAS", NVTE_Bias_Type::NVTE_NO_BIAS) \ .value("NVTE_PRE_SCALE_BIAS", NVTE_Bias_Type::NVTE_PRE_SCALE_BIAS) \ .value("NVTE_POST_SCALE_BIAS", NVTE_Bias_Type::NVTE_POST_SCALE_BIAS) \ - .value("NVTE_ALIBI", NVTE_Bias_Type::NVTE_ALIBI); \ + .value("NVTE_ALIBI", NVTE_Bias_Type::NVTE_ALIBI) \ + .export_values(); \ pybind11::enum_(m, "NVTE_Mask_Type") \ .value("NVTE_NO_MASK", NVTE_Mask_Type::NVTE_NO_MASK) \ .value("NVTE_PADDING_MASK", NVTE_Mask_Type::NVTE_PADDING_MASK) \ @@ -36,11 +37,13 @@ .value("NVTE_PADDING_CAUSAL_MASK", NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) \ .value("NVTE_CAUSAL_BOTTOM_RIGHT_MASK", NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK) \ .value("NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK", \ - NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK); \ + NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) \ + .export_values(); \ pybind11::enum_(m, "NVTE_QKV_Format") \ .value("NVTE_SBHD", NVTE_QKV_Format::NVTE_SBHD) \ .value("NVTE_BSHD", NVTE_QKV_Format::NVTE_BSHD) \ - .value("NVTE_THD", NVTE_QKV_Format::NVTE_THD); \ + .value("NVTE_THD", NVTE_QKV_Format::NVTE_THD) \ + .export_values(); \ pybind11::enum_(m, "NVTE_QKV_Layout") \ .value("NVTE_SB3HD", NVTE_QKV_Layout::NVTE_SB3HD) \ .value("NVTE_SBH3D", NVTE_QKV_Layout::NVTE_SBH3D) \ @@ -56,12 +59,14 @@ .value("NVTE_TH3D", NVTE_QKV_Layout::NVTE_TH3D) \ .value("NVTE_THD_T2HD", NVTE_QKV_Layout::NVTE_THD_T2HD) \ .value("NVTE_THD_TH2D", NVTE_QKV_Layout::NVTE_THD_TH2D) \ - .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD); \ + .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD) \ + .export_values(); \ pybind11::enum_(m, "NVTE_Fused_Attn_Backend") \ .value("NVTE_F16_max512_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) \ .value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) \ .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) \ - .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); \ + .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend) \ + .export_values(); \ pybind11::enum_(m, "NVTE_Activation_Type") \ .value("GELU", NVTE_Activation_Type::GELU) \ .value("GEGLU", NVTE_Activation_Type::GEGLU) \ @@ -72,7 +77,8 @@ .value("QGELU", 
NVTE_Activation_Type::QGELU) \ .value("QGEGLU", NVTE_Activation_Type::QGEGLU) \ .value("SRELU", NVTE_Activation_Type::SRELU) \ - .value("SREGLU", NVTE_Activation_Type::SREGLU); \ + .value("SREGLU", NVTE_Activation_Type::SREGLU) \ + .export_values(); \ pybind11::enum_(m, "CommOverlapType") \ .value("RS", transformer_engine::CommOverlapType::RS) \ .value("AG", transformer_engine::CommOverlapType::AG); \ diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 250e8e0c29..2ff98c20d9 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -3,9 +3,8 @@ # See LICENSE for license information. import warnings import operator -from functools import reduce +from functools import reduce, partial from typing import Optional, Tuple -from collections.abc import Iterable import jax import jax.numpy as jnp @@ -30,6 +29,7 @@ global_mesh_resource, lax_paral_op, all_reduce_max_along_all_axes_except_PP, + get_mesh_axis_size, ) @@ -38,6 +38,14 @@ "gemm_impl", ] +_COMM_GEMM_OVERLAP_LAYERS = ["qkv", "proj", "fc1", "fc2"] +_COMM_GEMM_OVERLAP_NAMES = ( + [layer + "_fprop" for layer in _COMM_GEMM_OVERLAP_LAYERS] + + [layer + "_dgrad" for layer in _COMM_GEMM_OVERLAP_LAYERS] + + [layer + "_wgrad" for layer in _COMM_GEMM_OVERLAP_LAYERS if layer != "fc2"] + + ["generic_ag", "generic_rs"] +) + def sanitize_dims(dim, ndims): return (ndims + dim) if dim < 0 else dim @@ -60,7 +68,7 @@ class CollectiveGemmPrimitive(BasePrimitive): """ name = "te_gemm" - impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15) + impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15, 16) multiple_results = True inner_primitive = None outer_primitive = None @@ -83,12 +91,18 @@ def abstract( grad, accumulate, use_split_accumulator, + comm_overlap_config, ): """ cuBlasLt GEMM abstract """ del grad, accumulate, use_split_accumulator + assert tex.ubuf_built_with_mpi(), ( + "Comm+GEMM overlap in TE/JAX requires Transformer Engine to be compiled with " + + "`NVTE_UB_WITH_MPI=1` and `MPI_HOME=/path/to/mpi` options." + ) + # Validate operand dtypes lhs_dtype = dtypes.canonicalize_dtype(lhs_aval.dtype) rhs_dtype = dtypes.canonicalize_dtype(rhs_aval.dtype) @@ -106,13 +120,13 @@ def abstract( and dtypes.canonicalize_dtype(rhs_scale_inv_aval.dtype) == jnp.float32 ), "Missing RHS operand scale inverse in FP8 GEMM." - # Validate operand layouts + # Validate operand layouts, adjusted for comm-overlap if necessary lhs_inner_dim, rhs_inner_dim = map( sanitize_dims, contracting_dims, (lhs_aval.ndim, rhs_aval.ndim) ) assert ( lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim] - ), f"Incompatible operand sizes: {lhs_aval.shape} x {rhs_aval.shape}." + ), f"Incompatible contracting dimensions: {lhs_aval.shape} x {rhs_aval.shape}." lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 @@ -153,6 +167,18 @@ def abstract( lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) + if rhs_aval.ndim > 2: + rhs_bdims = [ + dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] + ] + rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] + rhs_batch_size = reduce(operator.mul, rhs_bdims, 1) + if rhs_batch_size > 1: + assert lhs_batch_size == rhs_batch_size, ( + f"Leading dimensins of RHS ({rhs_batch_shape=}) is not broadcast-compatible " + + f"with the leading dimensions of LHS ({lhs_batch_shape=})." 
+ ) + # Infer output shape if batched_output: assert ( @@ -204,6 +230,43 @@ def abstract( else: assert gelu_input_aval.size == 0, "Internal TE error." + # Adjust output sizes for comm-overlap + extra_out_shape = (0,) + extra_out_dtype = jnp.bfloat16 + if comm_overlap_config is not None: + comm_overlap_type = comm_overlap_config.get("comm_type", None) + assert comm_overlap_type is not None, "Missing comm type for comm+GEMM overlap." + comm_overlap_name = comm_overlap_config.get("name", None) + assert ( + comm_overlap_name in _COMM_GEMM_OVERLAP_NAMES + ), f"Unrecognized comm+GEMM overlap name: {comm_overlap_name=}" + + mesh = comm_overlap_config.get("mesh", None) + tp_resource = comm_overlap_config.get("tp_resource", global_mesh_resource().tp_resource) + tp_size = get_mesh_axis_size(tp_resource, mesh=mesh) + + match comm_overlap_type: + case tex.CommOverlapType.AG: + # Extra output is all-gathered LHS copy + extra_out_shape = list(lhs_aval.shape).copy() + extra_out_shape[lhs_outer_dim] *= tp_size + extra_out_dtype = lhs_dtype + + case tex.CommOverlapType.RS: + # FP8 GEMM output for RS overlap is always FP8 + if jax_dtype_is_fp8(lhs_dtype): + assert jax_dtype_is_fp8( + out_dtype + ), "FP8 GEMM with reduce-scatter overlap requires FP8 output." + # Extra output is reduce-scattered GEMM output + extra_out_shape = list(out_shape).copy() + extra_out_shape[-2] /= tp_size + + case _: + raise RuntimeError( + f"Unrecognized comm type for comm+GEMM overlap: {comm_overlap_type=}" + ) + # Create abstract arrays for all outputs out_aval = lhs_aval.update(shape=out_shape, dtype=out_dtype) out_amax_updated_aval = out_amax_aval.update( @@ -214,6 +277,7 @@ def abstract( ) pre_gelu_out_aval = gelu_input_aval.update(shape=gelu_shape, dtype=bias_dtype) bias_grad_aval = bias_aval.update(shape=bias_aval.shape, dtype=bias_dtype) + extra_out_aval = jax.core.ShapedArray(shape=extra_out_shape, dtype=extra_out_dtype) workspace_aval = jax.core.ShapedArray( shape=(get_cublas_workspace_size_bytes(),), dtype=jnp.uint8 ) @@ -224,6 +288,7 @@ def abstract( out_scale_updated_aval, pre_gelu_out_aval, bias_grad_aval, + extra_out_aval, # global LHS for AG overlap, or sharded output for RS overlap workspace_aval, ) @@ -232,10 +297,23 @@ def outer_abstract(*args, **kwargs): """ cuBlasLt GEMM outer abstract """ - (out_aval, out_amax_aval, out_scale_aval, pre_gelu_out_aval, bias_grad_aval, _) = ( - CollectiveGemmPrimitive.abstract(*args, **kwargs) + ( + out_aval, + out_amax_aval, + out_scale_aval, + pre_gelu_out_aval, + bias_grad_aval, + extra_out_aval, + *_, + ) = CollectiveGemmPrimitive.abstract(*args, **kwargs) + return ( + out_aval, + out_amax_aval, + out_scale_aval, + pre_gelu_out_aval, + bias_grad_aval, + extra_out_aval, ) - return out_aval, out_amax_aval, out_scale_aval, pre_gelu_out_aval, bias_grad_aval @staticmethod def lowering( @@ -257,6 +335,7 @@ def lowering( grad, accumulate, use_split_accumulator, + comm_overlap_config, ): """ Fused attention fwd lowering rules @@ -278,7 +357,7 @@ def lowering( if is_ffi_enabled(): name = "te_gemm_ffi" - return ffi.ffi_lowering(name, operand_output_aliases=operand_output_aliases)( + ffi_args = ( ctx, lhs, lhs_scale_inv, @@ -288,6 +367,8 @@ def lowering( gelu_input, out_amax, out_scale, + ) + ffi_kwargs = dict( lhs_trans=lhs_trans, rhs_trans=rhs_trans, fuse_gelu=fuse_gelu, @@ -296,6 +377,15 @@ def lowering( accumulate=accumulate, use_split_accumulator=use_split_accumulator, ) + + if comm_overlap_config is not None: + name = "te_comm_gemm_overlap_ffi" + ffi_kwargs["comm_type"] = 
int(comm_overlap_config["comm_type"]) + ffi_kwargs["name"] = comm_overlap_config["name"] + + return ffi.ffi_lowering(name, operand_output_aliases=operand_output_aliases)( + *ffi_args, **ffi_kwargs + ) else: operands = [ lhs, @@ -325,7 +415,9 @@ def lowering( workspace_size = get_cublas_workspace_size_bytes() operand_dtype = jax_dtype_to_te_dtype(lhs_aval.dtype) bias_dtype = jax_dtype_to_te_dtype(bias_aval.dtype) - opaque = tex.pack_gemm_descriptor( + + descriptor_packer_fn = tex.pack_gemm_decriptor + descriptor_args = ( m, n, k, @@ -342,6 +434,16 @@ def lowering( use_split_accumulator, ) + comm_overlap_type = comm_overlap_config.get("comm_type", None) + if comm_overlap_type is not None: + name = "te_comm_gemm_overlap" + descriptor_packer_fn = tex.pack_overlap_descriptor + descriptor_args += ( + comm_overlap_type, + comm_overlap_config.get("name", None), + ) + opaque = descriptor_packer_fn(*descriptor_args) + return custom_caller( CollectiveGemmPrimitive.name, args, @@ -368,6 +470,7 @@ def impl( grad, accumulate, use_split_accumulator, + comm_overlap_config, ): assert CollectiveGemmPrimitive.inner_primitive is not None @@ -430,6 +533,7 @@ def impl( out_scale_updated, pre_gelu_out, bias_grad, + extra_out, _, ) = CollectiveGemmPrimitive.inner_primitive.bind( lhs, @@ -448,6 +552,7 @@ def impl( grad=grad, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, ) # Recover batched dimensions in the output @@ -455,7 +560,7 @@ def impl( out_shape = (*lhs_batch_shape, out.shape[-2] // lhs_batch_size, out.shape[-1]) out = jax.lax.reshape(out, out_shape) - return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad + return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad, extra_out @staticmethod def batcher( @@ -470,6 +575,7 @@ def batcher( grad, accumulate, use_split_accumulator, + comm_overlap_config, ): assert CollectiveGemmPrimitive.outer_primitive is not None check_valid_batch_dims(batch_dims) @@ -500,6 +606,7 @@ def infer_sharding_from_operands( grad, accumulate, use_split_accumulator, + comm_overlap_config, mesh, arg_infos, result_infos, @@ -515,48 +622,59 @@ def infer_sharding_from_operands( (lhs.ndim, rhs.ndim), ) - # Modify operand specs: - # - If contracting dimensions of both operands are sharded, force them to match. - # - If contracting dimensions of both operands are sharded, all-gather outer dimensions. - # - If contracting dimension of only one operand is sharded, all-gather the sharded - # operand. - # - Never scatter any operand. - lhs_spec_new = list(lhs_spec).copy() - rhs_spec_new = list(rhs_spec).copy() - lhs_spec_new[lhs_outer_dim] = None - if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: - assert ( - lhs_spec_new[lhs_inner_dim] == rhs_spec_new[rhs_inner_dim] - ), "Contracting dimensions of LHS and RHS operands must have the same sharding." - if lhs_spec_new[lhs_outer_dim] is not None: - warnings.warn( - "Outer dimension of the LHS operand must be all-gathered when both contracting " - + "dimensions are sharded. This will cause additional communication overhead." - ) + # Modify operand specs + lhs_spec_new = [spec for spec in lhs_spec] + rhs_spec_new = [spec for spec in rhs_spec] + reduce_output = False + if comm_overlap_config is None: + # When comm overlap is not enabled: + # - Always all-gather the outer dimension of LHS. + # - If contracting dims of both operands are sharded, all-gather RHS outer dim. 
+ # - If contracting dim of only one operand is sharded, all-gather the sharded operand. + # - Never scatter any operand. + lhs_spec_new[lhs_outer_dim] = None + if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: + assert ( + lhs_spec_new[lhs_inner_dim] == rhs_spec_new[rhs_inner_dim] + ), "Contracting dimensions of LHS and RHS operands must have the same sharding." + if lhs_spec_new[lhs_outer_dim] is not None: + warnings.warn( + "Outer dimension of the LHS operand must be all-gathered when both " + + "contracting dimensions are sharded. This will cause additional " + + "communication overhead." + ) - if rhs_spec_new[rhs_outer_dim] is not None: - warnings.warn( - "Outer dimension of the RHS operand must be all-gathered when both contracting " - + "dimensions are sharded. This will cause additional communication overhead." - ) - rhs_spec_new[rhs_outer_dim] = None - else: - if lhs_spec_new[lhs_inner_dim] is None and rhs_spec_new[rhs_inner_dim] is not None: - warnings.warn( - "Contracting dimension of the RHS operand must be all-gathered when the " - + "contracting dimension of the LHS operand is unsharded. This will cause " - + "additional communication overhead." - ) - if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is None: - if not grad: - # This is expected for sequence/context-parallel gradient in BWD (DGRAD) GEMM. + if rhs_spec_new[rhs_outer_dim] is not None: + warnings.warn( + "Outer dimension of the RHS operand must be all-gathered when both " + + "contracting dimensions are sharded. This will cause additional " + + "communication overhead." + ) + rhs_spec_new[rhs_outer_dim] = None + reduce_output = True + else: + if lhs_spec_new[lhs_inner_dim] is None and rhs_spec_new[rhs_inner_dim] is not None: warnings.warn( - "Contracting dimension of the LHS operand must be all-gathered when the " - + "contracting dimension of the RHS operand is unsharded. This will cause " + "Contracting dimension of the RHS operand must be all-gathered when the " + + "contracting dimension of the LHS operand is unsharded. This will cause " + "additional communication overhead." ) - lhs_spec_new[lhs_inner_dim] = None - rhs_spec_new[rhs_inner_dim] = None + if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is None: + if not grad: + # This is expected for sequence/context-parallel gradient in BWD (DGRAD) GEMM. + warnings.warn( + "Contracting dimension of the LHS operand must be all-gathered when " + + "the contracting dimension of the RHS operand is unsharded. This " + + "will cause additional communication overhead." + ) + lhs_spec_new[lhs_inner_dim] = None + rhs_spec_new[rhs_inner_dim] = None + else: + # When comm overlap is enabled, make sure both contracting dims are unsharded if one + # of them is unsharded. 
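+                # Worked example (hypothetical mesh axes "dp"/"tp", for illustration only):
+                # with lhs_spec = ("dp", None, "tp") and rhs_spec = (None, "tp"), only the LHS
+                # contracting dimension is sharded, so both contracting specs are reset to None
+                # below and the LHS is all-gathered over "tp" before the overlapped GEMM. If
+                # instead both contracting dimensions carry "tp", their shardings are left in
+                # place for the overlap kernel to handle.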
+ if lhs_spec_new[lhs_inner_dim] is None or rhs_spec_new[rhs_inner_dim] is None: + lhs_spec_new[lhs_inner_dim] = None + rhs_spec_new[rhs_inner_dim] = None out_col_spec = rhs_spec_new[rhs_outer_dim] # Output sharding is conditional on output shape @@ -577,7 +695,50 @@ def infer_sharding_from_operands( # Bias gradient spec matches outer dimension of output if bias fusion is turned on bias_sharding = NamedSharding(mesh, PartitionSpec(out_col_spec if fuse_bias else None)) - return (out_sharding, fp8_meta_sharding, fp8_meta_sharding, gelu_sharding, bias_sharding) + # Validate operand sharding for comm+GEMM overlap and adust extra output sharding + extra_out_spec = [None] + if comm_overlap_config is not None: + mesh = comm_overlap_config.get("mesh", None) + tp_resource = comm_overlap_config.get("tp_resource", global_mesh_resource().tp_resource) + match comm_overlap_config.get("comm_type", None): + case tex.CommOverlapType.AG: + # AG overlap requires the outer dimension of LHS to be sharded + # over the TP resource + assert lhs_spec[lhs_outer_dim] == tp_resource, ( + "AG+GEMM overlap requires the outer (sequence) dimension of the LHS " + + f"operand to be sharded over the TP resource (mesh axis: {tp_resource=})." + ) + extra_out_spec = list(lhs_spec).copy() + extra_out_spec[lhs_outer_dim] = None + + case tex.CommOverlapType.RS: + # RS overlap requires the contracting dimensions of both LHS and RHS to be + # sharded over the TP resource, and the outer dimension of LHS to be unsharded + assert lhs_spec[lhs_outer_dim] is None, ( + "GEMM+RS overlap requires the outer (sequence) dimension of the LHS " + + "operand to be un-sharded." + ) + assert lhs_spec[lhs_inner_dim] == tp_resource, ( + "GEMM+RS overlap requires the contracting dimension of the LHS operand " + + f"to be sharded over the TP resource (mesh axis: {tp_resource=})." + ) + assert rhs_spec[rhs_inner_dim] == tp_resource, ( + "GEMM+RS overlap requires the contracting dimension of the RHS operand " + + f"to be sharded over the TP resource (mesh axis: {tp_resource=})." + ) + extra_out_spec = out_spec.copy() + extra_out_spec[-2] = tp_resource + + extra_out_sharding = NamedSharding(mesh, PartitionSpec(*extra_out_spec)) + + return ( + out_sharding, + fp8_meta_sharding, + fp8_meta_sharding, + gelu_sharding, + bias_sharding, + extra_out_sharding, + ) @staticmethod def partition( @@ -589,6 +750,7 @@ def partition( grad, accumulate, use_split_accumulator, + comm_overlap_config, mesh, arg_infos, result_infos, @@ -604,23 +766,31 @@ def partition( (lhs.ndim, rhs.ndim), ) - # Modify operand specs: - # - Always all-gather the outer dimension of LHS. - # - If contracting dimensions of both operands are sharded, all-gather RHS outer dimension. - # - If contracting dimension of only one operand is sharded, all-gather the sharded - # operand. - # - Never scatter any operand. - lhs_spec_new = list(lhs_spec).copy() - rhs_spec_new = list(rhs_spec).copy() + # Modify operand specs + lhs_spec_new = [spec for spec in lhs_spec] + rhs_spec_new = [spec for spec in rhs_spec] reduce_output = False - lhs_spec_new[lhs_outer_dim] = None - if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: - rhs_spec_new[rhs_outer_dim] = None - reduce_output = True + if comm_overlap_config is None: + # When comm overlap is not enabled: + # - Always all-gather the outer dimension of LHS. + # - If contracting dims of both operands are sharded, all-gather RHS outer dim. + # - If contracting dim of only one operand is sharded, all-gather the sharded operand. 
+ # - Never scatter any operand. + lhs_spec_new[lhs_outer_dim] = None + if lhs_spec_new[lhs_inner_dim] is not None and rhs_spec_new[rhs_inner_dim] is not None: + rhs_spec_new[rhs_outer_dim] = None + reduce_output = True + else: + lhs_spec_new[lhs_inner_dim] = None + rhs_spec_new[rhs_inner_dim] = None else: - lhs_spec_new[lhs_inner_dim] = None - rhs_spec_new[rhs_inner_dim] = None + # When comm overlap is enabled, make sure both contracting dims are unsharded if one + # of them is unsharded. + if lhs_spec_new[lhs_inner_dim] is None or rhs_spec_new[rhs_inner_dim] is None: + lhs_spec_new[lhs_inner_dim] = None + rhs_spec_new[rhs_inner_dim] = None out_col_spec = rhs_spec_new[rhs_outer_dim] + lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) rhs_sharding = NamedSharding(mesh, PartitionSpec(*rhs_spec_new)) @@ -642,6 +812,22 @@ def partition( gelu_spec = [None, out_col_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) + # Adjust extra output sharding for comm+GEMM overlap + extra_out_spec = [None] + if comm_overlap_config is not None: + mesh = comm_overlap_config.get("mesh", None) + tp_resource = comm_overlap_config.get("tp_resource", global_mesh_resource().tp_resource) + match comm_overlap_config.get("comm_type", None): + case tex.CommOverlapType.AG: + extra_out_spec = list(lhs_spec).copy() + extra_out_spec[lhs_outer_dim] = None + + case tex.CommOverlapType.RS: + extra_out_spec = out_spec.copy() + extra_out_spec[-2] = tp_resource + + extra_out_sharding = NamedSharding(mesh, PartitionSpec(*extra_out_spec)) + arg_shardings = ( lhs_sharding, fp8_meta_sharding, @@ -658,6 +844,7 @@ def partition( fp8_meta_sharding, gelu_sharding, bias_sharding, + extra_out_sharding, ) def sharded_impl( @@ -669,6 +856,7 @@ def sharded_impl( out_scale_updated, pre_gelu_out, bias_grad, + extra_out, ) = CollectiveGemmPrimitive.impl( lhs, lhs_scale_inv, @@ -686,6 +874,7 @@ def sharded_impl( grad=grad, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, ) # FP8 amax reduction @@ -693,12 +882,15 @@ def sharded_impl( out_amax_updated = all_reduce_max_along_all_axes_except_PP(out_amax_updated, mesh) # All-reduce sum GEMM output when contracting dimensions are sharded - if reduce_output: - out = jax.lax.psum(out, global_mesh_resource().tp_resource) - if fuse_gelu: - pre_gelu_out = jax.lax.psum(pre_gelu_out, global_mesh_resource().tp_resource) + if comm_overlap_config is None: + if reduce_output: + out = jax.lax.psum(out, global_mesh_resource().tp_resource) + if fuse_gelu: + pre_gelu_out = jax.lax.psum( + pre_gelu_out, global_mesh_resource().tp_resource + ) - return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad + return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad, extra_out return mesh, sharded_impl, out_shardings, arg_shardings @@ -706,62 +898,6 @@ def sharded_impl( register_primitive(CollectiveGemmPrimitive) -def fp8_gemm_impl( - lhs: ArrayLike, - lhs_scale_inv: ArrayLike, - rhs_t: ArrayLike, - rhs_scale_inv: ArrayLike, - bias: Optional[ArrayLike] = None, - gelu_input: Optional[ArrayLike] = None, - out_amax: Optional[ArrayLike] = None, - out_scale: Optional[ArrayLike] = None, - out_dtype: jnp.dtype = jnp.bfloat16, - batched_output: bool = False, - fuse_gelu: bool = False, - fuse_bias: bool = False, - accumulate: bool = False, - use_split_accumulator: bool = False, -) -> Tuple[ArrayLike, ...]: - """FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" - if out_dtype is not 
None and jax_dtype_is_fp8(out_dtype): - assert out_amax is not None and out_scale is not None, "Missing output amax and scale." - else: - out_amax = jnp.zeros(0, dtype=jnp.float32) - out_scale = jnp.zeros(0, dtype=jnp.float32) - - if not fuse_bias: - bias = jnp.zeros(0, dtype=jnp.bfloat16) - else: - assert bias is not None, "Missing bias in forward GEMM when bias epilogue is enabled." - - if not fuse_gelu: - gelu_input = jnp.zeros(0, dtype=bias.dtype) - elif gelu_input is None: - gelu_shape = (reduce(operator.mul, lhs.shape[:-1]), rhs_t.shape[-1]) - gelu_input = jnp.zeros(gelu_shape, dtype=bias.dtype) - - out, out_amax, out_scale, pre_gelu_out, _ = CollectiveGemmPrimitive.outer_primitive.bind( - lhs, - lhs_scale_inv, - rhs_t, - rhs_scale_inv, - bias, - gelu_input, - out_amax, - out_scale, - out_dtype=out_dtype, - batched_output=batched_output, - contracting_dims=(-1, -1), - fuse_gelu=fuse_gelu, - fuse_bias=fuse_bias, - grad=False, - accumulate=accumulate, - use_split_accumulator=use_split_accumulator, - ) - - return out, out_amax, out_scale, pre_gelu_out - - def gemm_impl( lhs: ArrayLike, rhs: ArrayLike, @@ -774,19 +910,19 @@ def gemm_impl( grad: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, + comm_overlap_config: Optional[dict] = None, ) -> Tuple[ArrayLike, ...]: """Non-FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" + dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) - lhs_outer_dim, rhs_outer_dim = map( - mirror_dim, - (lhs_inner_dim, rhs_inner_dim), - (lhs.ndim, rhs.ndim), - ) + lhs_outer_dim = lhs.ndim - 1 if lhs_inner_dim != lhs.ndim - 1 else lhs.ndim - 2 + rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 + out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) if not fuse_bias: bias = jnp.zeros(0, dtype=lhs.dtype) elif grad: - bias = jnp.zeros(rhs.shape[rhs_outer_dim], dtype=lhs.dtype) + bias = jnp.zeros(out_shape[-1], dtype=lhs.dtype) else: assert bias is not None, "Missing bias in forward GEMM when bias epilogue is enabled." @@ -797,13 +933,16 @@ def gemm_impl( gelu_input is not None ), "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." 
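    # Note: the backward (grad) path above requires the pre-GELU activations saved from the
    # forward GEMM, while the forward path below falls back to a zero placeholder shaped like
    # the GEMM output so the custom call always receives a well-formed gelu_input operand.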
elif gelu_input is None: - bdims = [dim for dim in range(lhs.ndim) if dim not in [lhs_inner_dim, lhs_outer_dim]] - batch_size = reduce(operator.mul, [lhs.shape[dim] for dim in bdims], 1) - gelu_shape = (batch_size * lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) - gelu_input = jnp.zeros(gelu_shape, dtype=lhs.dtypes) - - dummy_fp8_meta = jnp.zeros(0, dtype=jnp.float32) - out, _, _, pre_gelu_out, bias_grad = CollectiveGemmPrimitive.outer_primitive.bind( + gelu_input = jnp.zeros(out_shape, dtype=lhs.dtypes) + + ( + out, + _, # out_amax in FP8 GEMM + _, # out_scale in FP8 GEMM + pre_gelu_out, + bias_grad, + extra_out, + ) = CollectiveGemmPrimitive.outer_primitive.bind( lhs, dummy_fp8_meta, rhs, @@ -820,9 +959,156 @@ def gemm_impl( grad=grad, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, ) if grad: - return out, pre_gelu_out, bias_grad + return out, pre_gelu_out, bias_grad, extra_out + else: + return out, pre_gelu_out, extra_out + + +def fp8_gemm_impl( + lhs: ArrayLike, + lhs_scale_inv: ArrayLike, + rhs_t: ArrayLike, + rhs_scale_inv: ArrayLike, + bias: Optional[ArrayLike] = None, + gelu_input: Optional[ArrayLike] = None, + out_amax: Optional[ArrayLike] = None, + out_scale: Optional[ArrayLike] = None, + out_dtype: jnp.dtype = jnp.bfloat16, + batched_output: bool = False, + fuse_gelu: bool = False, + fuse_bias: bool = False, + accumulate: bool = False, + use_split_accumulator: bool = False, + comm_overlap_config: Optional[dict] = None, +) -> Tuple[ArrayLike, ...]: + """FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" + if out_dtype is not None and jax_dtype_is_fp8(out_dtype): + assert out_amax is not None and out_scale is not None, "Missing output amax and scale." else: - return out, pre_gelu_out + out_amax = jnp.zeros(0, dtype=jnp.float32) + out_scale = jnp.zeros(0, dtype=jnp.float32) + + if not fuse_bias: + bias = jnp.zeros(0, dtype=jnp.bfloat16) + else: + assert bias is not None, "Missing bias in forward GEMM when bias epilogue is enabled." + + if not fuse_gelu: + gelu_input = jnp.zeros(0, dtype=bias.dtype) + elif gelu_input is None: + gelu_shape = (reduce(operator.mul, lhs.shape[:-1]), rhs_t.shape[-1]) + gelu_input = jnp.zeros(gelu_shape, dtype=bias.dtype) + + (out, out_amax, out_scale, pre_gelu_out, _, extra_out) = ( # bias_grad in non-FP8 GEMM + CollectiveGemmPrimitive.outer_primitive.bind( + rhs_t, + rhs_scale_inv, + lhs, + lhs_scale_inv, + bias, + gelu_input, + out_amax, + out_scale, + out_dtype=out_dtype, + batched_output=batched_output, + contracting_dims=(-1, -1), + fuse_gelu=fuse_gelu, + fuse_bias=fuse_bias, + grad=False, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, + ) + ) + + return out, out_amax, out_scale, pre_gelu_out, extra_out + + +class CopyIntoOverlapBufferPrimitive(BasePrimitive): + """ + Copy JAX array data into comm+GEMM overlap buffer + """ + + name = "te_copy_into_overlap_buffer" + impl_static_args = (1, 2) + multiple_results = False + inner_primitive = None + outer_primitive = None + + @staticmethod + def abstract(inp_aval, name, comm_type): + assert name in _COMM_GEMM_OVERLAP_NAMES, f"Unrecognized comm+GEMM overlap name: {name=}" + assert comm_type in [ + tex.CommOverlapType.AG, + tex.CommOverlapType.RS, + ], "Invalid comm+GEMM overlap type." + assert inp_aval.size > 0, "Cannot copy a zero-size array into overlap buffer." + assert inp_aval.ndim == 2, "Cannot copy more than 2 dimensions!" 
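+        # The copy writes into the externally managed communication buffer rather than
+        # producing a JAX array, so the abstract below returns only a zero-size placeholder
+        # output in the input dtype.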
+ return jax.core.ShapedArray(shape=(0,), dtype=dtypes.canonicalize_dtype(inp_aval.dtype)) + + @staticmethod + def lowering(ctx, inp, *, name, comm_type): + if is_ffi_enabled(): + name = "te_copy_into_overlap_buffer_ffi" + return ffi.ffi_lowering(name)( + ctx, + inp, + name=name, + comm_type=int(comm_type), + ) + else: + operands = [inp] + operand_shapes = [ir.RankedTensorType(inp.type).shape] + out_types = [] + args = CustomCallArgsWrapper(out_types, operands, operand_shapes) + opaque = tex.pack_buffer_descriptor( + name, inp.shape, jax_dtype_to_te_dtype(inp.dtype), comm_type + ) + return custom_caller(CopyIntoOverlapBufferPrimitive.name, args, opaque, False) + + @staticmethod + def impl(inp, name, comm_type): + assert CopyIntoOverlapBufferPrimitive.inner_primitive is not None + return CopyIntoOverlapBufferPrimitive.inner_primitive.bind( + inp, name=name, comm_type=comm_type + ) + + @staticmethod + def batcher(batched_args, batch_dims, *, name, comm_type): + assert CopyIntoOverlapBufferPrimitive.inner_primitive is not None + check_valid_batch_dims(batch_dims) + return ( + CopyIntoOverlapBufferPrimitive.inner_primitive.bind( + *batched_args, name=name, comm_type=comm_type + ), + None, + ) + + @staticmethod + def infer_sharding_from_operands(name, comm_type, mesh, arg_infos, result_infos): + del name, comm_type, arg_infos, result_infos + return NamedSharding(mesh, PartitionSpec(None)) + + @staticmethod + def partition(name, comm_type, mesh, arg_infos, result_infos): + del name, comm_type, result_infos + inp_spec = arg_infos[0] + arg_shardings = (NamedSharding(mesh, PartitionSpec(*inp_spec)),) + out_sharding = NamedSharding(mesh, PartitionSpec(None)) + return ( + mesh, + partial(CopyIntoOverlapBufferPrimitive.impl, name=name, comm_type=comm_type), + out_sharding, + arg_shardings, + ) + + +register_primitive(CopyIntoOverlapBufferPrimitive) + + +def copy_into_overlap_buffer(inp: ArrayLike, name: str, comm_type: tex.CommOverlapType) -> None: + _ = CollectiveGemmPrimitive.outer_primitive.bind(inp, name=name, comm_type=comm_type) diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index afac283a6f..d123d9b5b4 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -4,8 +4,8 @@ * See LICENSE for license information. 
************************************************************************/ -#ifndef TRANSFORMER_ENGINE_JAX_CSRC_FP8_MODULES_H_ -#define TRANSFORMER_ENGINE_JAX_CSRC_FP8_MODULES_H_ +#ifndef TRANSFORMER_ENGINE_JAX_CSRC_EXTENSIONS_H_ +#define TRANSFORMER_ENGINE_JAX_CSRC_EXTENSIONS_H_ #include #include @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -148,7 +149,6 @@ pybind11::bytes PackCustomCallFusedAttnDescriptor( bool deterministic, int64_t window_size_left, int64_t window_size_right); struct CustomCallGemmDescriptor { - size_t batch; size_t m; size_t k; size_t n; @@ -165,13 +165,50 @@ struct CustomCallGemmDescriptor { bool use_split_accumulator; }; -pybind11::bytes PackCustomCallGemmDescriptor(size_t batch, size_t m, size_t n, size_t k, - size_t workspace_size, DType operand_dtype, - DType out_dtype, DType bias_dtype, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, - bool grad, bool accumulate, +pybind11::bytes PackCustomCallGemmDescriptor(size_t m, size_t n, size_t k, size_t workspace_size, + DType operand_dtype, DType out_dtype, DType bias_dtype, + bool lhs_trans, bool rhs_trans, bool fuse_gelu, + bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator); +struct CustomCallBufferDescriptor { + const std::string name; + const size_t *shape; + const size_t ndim; + DType dtype; + CommOverlapType comm_type; +}; + +pybind11::bytes PackCustomCallBufferDescriptor(const std::string &name, + const std::vector &shape, DType dtype, + CommOverlapType comm_type); + +struct CustomCallOverlapDescriptor { + size_t m; + size_t k; + size_t n; + size_t workspace_size; + DType operand_dtype; + DType bias_dtype; + DType out_dtype; + bool lhs_trans; + bool rhs_trans; + bool fuse_gelu; + bool fuse_bias; + bool grad; + bool accumulate; + bool use_split_accumulator; + CommOverlapType comm_type; + const std::string name; +}; + +pybind11::bytes PackCustomCallOverlapDescriptor(size_t m, size_t k, size_t n, size_t workspace_size, + DType operand_dtype, DType bias_dtype, + DType out_dtype, bool lhs_trans, bool rhs_trans, + bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator, + CommOverlapType comm_type, const std::string &name); + // Transpose void Transpose(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); @@ -341,13 +378,52 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out_amax, Buffer_Type out_scale, Result_Type out, Result_Type out_amax_updated, Result_Type out_scale_updated, - Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type workspace, - bool lhs_trans, bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, - bool accumulate, bool use_split_accumulator); + Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type dummy_out, + Result_Type workspace, bool lhs_trans, bool rhs_trans, bool fuse_gelu, + bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator); XLA_FFI_DECLARE_HANDLER_SYMBOL(GemmHandler); +// Comm+GEMM Overlap + +void BootstrapCommGemmOverlap(const std::string &name, const std::string &method, + const std::vector &buffer_shape, DType buffer_dtype, + CommOverlapType comm_type, int tp_size, int num_splits, + int num_max_streams, int comm_cga_size, int num_comm_sm, + int set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm); + +void DestroyCommGemmOverlap(const std::string &name); + +void 
SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, + bool grad = false); + +bool OverlapBufferIsFp8(const std::string &name); + +pybind11::object GetOverlapBuffer(const std::string &name, CommOverlapType comm_type); + +void CopyIntoOverlapBuffer(cudaStream_t, void **buffers, const char *opaque, size_t opaque_len); + +Error_Type CopyIntoOverlapBufferFFI(cudaStream_t stream, Buffer_Type input, std::string_view name, + int32_t comm_type_flag); + +XLA_FFI_DECLARE_HANDLER_SYMBOL(CopyIntoOverlapBufferHandler); + +void CommGemmOverlap(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); + +Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, + Buffer_Type rhs, Buffer_Type rhs_scale_inv, Buffer_Type bias, + Buffer_Type gelu_input, Buffer_Type out_amax, Buffer_Type out_scale, + Result_Type out, Result_Type out_amax_new, Result_Type out_scale_new, + Result_Type pre_gelu_out, Result_Type bias_grad, + Result_Type extra_out, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator, int32_t comm_type_flag, + std::string_view name); + +XLA_FFI_DECLARE_HANDLER_SYMBOL(CommGemmOverlapHandler); + } // namespace jax } // namespace transformer_engine -#endif // TRANSFORMER_ENGINE_JAX_CSRC_FP8_MODULES_H_ +#endif // TRANSFORMER_ENGINE_JAX_CSRC_EXTENSIONS_H_ diff --git a/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp b/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp new file mode 100644 index 0000000000..df1f4bdc23 --- /dev/null +++ b/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp @@ -0,0 +1,291 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "common/util/dlpack_helper.h" +#include "extensions.h" + +void _dummy_allgather(void *global, size_t globalbytes, void *local, size_t localbytes, + ExtComm comm) {}; + +void _dummy_barrier(ExtComm comm) {}; + +namespace transformer_engine { + +namespace jax { + +static std::unordered_map _overlaps; + +void BootstrapCommGemmOverlap(const std::string &name, const std::string &method, + const std::vector &buffer_shape, DType buffer_dtype, + CommOverlapType comm_type, int tp_size, int num_splits, + int num_max_streams, int comm_cga_size, int num_comm_sm, + int set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm) { +#ifndef NVTE_UB_WITH_MPI + NVTE_ERROR( + std::string("Comm+GEMM overlap in TE/JAX requires bootstrapping Userbuffers with MPI. 
") + + std::string("Please compile TE with `NVTE_UB_WITH_MPI=1`.")); +#endif + + // Initialize overlap object -- this allocates the comm buffer + NVTE_CHECK(_overlaps.find(name) == _overlaps.end(), name, " is already initialized!"); + if (method == "ring-exchange") { + _overlaps[name] = reinterpret_cast(new CommOverlapP2PBase( + buffer_shape, buffer_dtype, -1, -1, -1, -1, -1, -1, tp_size, &_dummy_allgather, + &_dummy_barrier, comm_type, num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, + use_ce, atomic_gemm, aggregate)); + } else { + _overlaps[name] = reinterpret_cast(new CommOverlapBase( + buffer_shape, buffer_dtype, -1, -1, -1, -1, -1, -1, tp_size, &_dummy_allgather, + &_dummy_barrier, num_splits, num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, + atomic_gemm, pipeline_rs_overlap_first_gemm)); + } +}; + +void DestroyCommGemmOverlap(const std::string &name) { + auto overlap = _overlaps.find(name); + if (overlap != _overlaps.end()) { + delete overlap->second; + _overlaps.erase(overlap); + } +}; + +void SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, bool grad) { + auto scale_inv_tensor = DLPackWrapper(scale_inv, grad); + _overlaps[name]->set_ubuf_scale_inv(reinterpret_cast(scale_inv_tensor.dptr())); +} + +bool OverlapBufferIsFp8(const std::string &name) { return _overlaps[name]->is_fp8_ubuf(); } + +pybind11::object GetOverlapBuffer(const std::string &name, CommOverlapType comm_type) { + DLPackWrapper output = std::move(_overlaps[name]->get_ubuf_output(comm_type)); + auto capsule = output.capsule(); + return capsule; +}; + +void CopyIntoOverlapBufferImpl(cudaStream_t stream, void *input_ptr, + const std::vector &shape, DType dtype, + const std::string &name, CommOverlapType comm_type) { + auto input = TensorWrapper(input_ptr, shape, dtype); + _overlaps[name]->copy_into_ubuf(stream, input, comm_type); +} + +void CopyIntoOverlapBuffer(cudaStream_t stream, void **buffers, const char *opaque, + size_t opaque_len) { + auto input_ptr = buffers[0]; + + const auto &desc = *UnpackOpaque(opaque, opaque_len); + + CopyIntoOverlapBufferImpl(stream, input_ptr, + std::vector(desc.shape, desc.shape + desc.ndim), desc.dtype, + desc.name, desc.comm_type); +} + +Error_Type CopyIntoOverlapBufferFFI(cudaStream_t stream, Buffer_Type input, std::string_view name, + int32_t comm_type_flag) { + auto input_ptr = input.untyped_data(); + auto shape = std::vector(input.dimensions().begin(), input.dimensions().end()); + auto dtype = convert_ffi_datatype_to_te_dtype(input.element_type()); + + CopyIntoOverlapBufferImpl(stream, input_ptr, shape, dtype, static_cast(name), + static_cast(comm_type_flag)); + + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL(CopyIntoOverlapBufferHandler, CopyIntoOverlapBufferFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // input + .Attr("name") + .Attr("comm_type_flag"), + FFI_CudaGraph_Traits); + +void CommGemmOverlapImpl(void *lhs, const std::vector &lhs_shape, DType lhs_dtype, + float *lhs_scale_inv, bool lhs_trans, void *rhs, + const std::vector &rhs_shape, DType rhs_dtype, + float *rhs_scale_inv, bool rhs_trans, void *out, + const std::vector &out_shape, DType out_dtype, float *out_amax, + float *out_scale, void *bias, DType bias_dtype, void *pre_gelu_out, + void *extra_out, const std::vector &extra_out_shape, + void *workspace, size_t workspace_size, bool fuse_gelu, bool fuse_bias, + bool grad, bool accumulate, bool use_split_accumulator, + CommOverlapType comm_type, const std::string &name, cudaStream_t 
stream) { + auto lhs_ = TensorWrapper(lhs, lhs_shape, lhs_dtype, nullptr, nullptr, lhs_scale_inv); + auto rhs_ = TensorWrapper(rhs, rhs_shape, rhs_dtype, nullptr, nullptr, rhs_scale_inv); + auto out_ = TensorWrapper(out, out_shape, out_dtype, out_amax, out_scale, nullptr); + + auto bias_ptr = (fuse_bias) ? bias : nullptr; + auto bias_shape = (fuse_bias) ? std::vector(out_shape.back()) : std::vector{0}; + auto bias_ = TensorWrapper(bias_ptr, bias_shape, bias_dtype); + + auto pre_gelu_ptr = (fuse_gelu) ? pre_gelu_out : nullptr; + auto pre_gelu_shape = (fuse_gelu) ? out_shape : std::vector{0}; + auto pre_gelu_out_ = TensorWrapper(pre_gelu_ptr, pre_gelu_shape, bias_dtype); + + auto workspace_ = TensorWrapper(workspace, std::vector{workspace_size}, DType::kByte); + + auto extra_out_ = + TensorWrapper(extra_out, extra_out_shape, lhs_dtype, nullptr, nullptr, lhs_scale_inv); + + auto overlap = _overlaps[name]; + if (comm_type == CommOverlapType::AG) { + // AG overlap is only ring-exchange + if (overlap->is_atomic_gemm()) { + overlap->atomic_gemm_overlap_ag(rhs_, rhs_trans, lhs_, lhs_trans, out_, bias_, pre_gelu_out_, + workspace_, grad, accumulate, use_split_accumulator, + extra_out_, stream); + } else { + overlap->split_overlap_ag(rhs_, rhs_trans, lhs_, lhs_trans, out_, bias_, pre_gelu_out_, + workspace_, grad, accumulate, use_split_accumulator, extra_out_, + stream); + } + } else if (comm_type == CommOverlapType::RS) { + if (overlap->is_atomic_gemm()) { + overlap->atomic_gemm_overlap_rs(rhs_, rhs_trans, lhs_, lhs_trans, out_, bias_, pre_gelu_out_, + workspace_, grad, accumulate, use_split_accumulator, + extra_out_, stream); + } else { + overlap->split_overlap_rs(rhs_, rhs_trans, lhs_, lhs_trans, out_, bias_, pre_gelu_out_, + workspace_, grad, accumulate, use_split_accumulator, extra_out_, + stream); + } + } +} + +void CommGemmOverlap(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len) { + // Inputs + auto lhs = buffers[0]; + auto lhs_scale_inv = reinterpret_cast(buffers[1]); + auto rhs = buffers[2]; + auto rhs_scale_inv = reinterpret_cast(buffers[3]); + auto bias = buffers[4]; + auto gelu_input = buffers[5]; + auto out_amax = reinterpret_cast(buffers[6]); + auto out_scale = reinterpret_cast(buffers[7]); + + // Outputs + auto out = buffers[8]; + auto out_amax_new = reinterpret_cast(buffers[9]); + auto out_scale_new = reinterpret_cast(buffers[10]); + auto pre_gelu_out = buffers[11]; + auto bias_grad = buffers[12]; + auto extra_out = buffers[13]; + auto workspace = buffers[14]; + + // Check operand-output aliases + NVTE_CHECK(bias == bias_grad, "bias not bound to bias_grad in AG+GEMM overlap."); + NVTE_CHECK(gelu_input == pre_gelu_out, + "gelu_input not bound to pre_gelu_out in AG+GEMM overlap."); + NVTE_CHECK(out_amax == out_amax_new, "out_amax not bound to out_amax_new in AG+GEMM overlap."); + NVTE_CHECK(out_scale == out_scale_new, + "out_scale not bound to out_scale_new in AG+GEMM overlap."); + + const auto &desc = *UnpackOpaque(opaque, opaque_len); + + auto lhs_shape = + (desc.lhs_trans) ? std::vector{desc.k, desc.m} : std::vector{desc.m, desc.k}; + auto rhs_shape = + (desc.rhs_trans) ? 
std::vector{desc.n, desc.k} : std::vector{desc.k, desc.n}; + auto out_shape = std::vector{desc.m, desc.n}; + + CommGemmOverlapImpl(lhs, lhs_shape, desc.operand_dtype, lhs_scale_inv, desc.lhs_trans, rhs, + rhs_shape, desc.operand_dtype, rhs_scale_inv, desc.rhs_trans, out, out_shape, + desc.out_dtype, out_amax, out_scale, bias, desc.bias_dtype, pre_gelu_out, + extra_out, lhs_shape, workspace, desc.workspace_size, desc.fuse_gelu, + desc.fuse_bias, desc.grad, desc.accumulate, desc.use_split_accumulator, + desc.comm_type, desc.name, stream); +} + +Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, + Buffer_Type rhs, Buffer_Type rhs_scale_inv, Buffer_Type bias, + Buffer_Type gelu_input, Buffer_Type out_amax, Buffer_Type out_scale, + Result_Type out, Result_Type out_amax_new, Result_Type out_scale_new, + Result_Type pre_gelu_out, Result_Type bias_grad, + Result_Type extra_out, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator, int32_t comm_type_flag, + std::string_view name) { + // Inputs + auto lhs_ptr = lhs.untyped_data(); + auto lhs_shape = std::vector(lhs.dimensions().begin(), lhs.dimensions().end()); + auto lhs_dtype = convert_ffi_datatype_to_te_dtype(lhs.element_type()); + auto lhs_scale_inv_ptr = reinterpret_cast(lhs_scale_inv.untyped_data()); + auto rhs_ptr = rhs.untyped_data(); + auto rhs_shape = std::vector(rhs.dimensions().begin(), rhs.dimensions().end()); + auto rhs_dtype = convert_ffi_datatype_to_te_dtype(rhs.element_type()); + auto rhs_scale_inv_ptr = reinterpret_cast(rhs_scale_inv.untyped_data()); + auto bias_ptr = bias.untyped_data(); + auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type()); + auto gelu_input_ptr = gelu_input.untyped_data(); + auto out_amax_ptr = reinterpret_cast(out_amax.untyped_data()); + auto out_scale_ptr = reinterpret_cast(out_scale.untyped_data()); + + // Outputs + auto out_ptr = out->untyped_data(); + auto out_shape = std::vector(out->dimensions().begin(), out->dimensions().end()); + auto out_dtype = convert_ffi_datatype_to_te_dtype(out->element_type()); + auto out_amax_new_ptr = reinterpret_cast(out_amax_new->untyped_data()); + auto out_scale_new_ptr = reinterpret_cast(out_scale_new->untyped_data()); + auto pre_gelu_ptr = pre_gelu_out->untyped_data(); + auto bias_grad_ptr = bias_grad->untyped_data(); + auto extra_out_ptr = extra_out->untyped_data(); + auto extra_out_shape = + std::vector(extra_out->dimensions().begin(), extra_out->dimensions().end()); + auto workspace_ptr = workspace->untyped_data(); + auto workspace_size = workspace->element_count(); + + // Check operand-output aliases + NVTE_CHECK(bias_ptr == bias_grad_ptr, "bias not bound to bias_grad in AG+GEMM overlap."); + NVTE_CHECK(gelu_input_ptr == pre_gelu_ptr, + "gelu_input not bound to pre_gelu_out in AG+GEMM overlap."); + NVTE_CHECK(out_amax_ptr == out_amax_new_ptr, + "out_amax not bound to out_amax_new in AG+GEMM overlap."); + NVTE_CHECK(out_scale_ptr == out_scale_new_ptr, + "out_scale not bound to out_scale_new in AG+GEMM overlap."); + + CommGemmOverlapImpl( + lhs_ptr, lhs_shape, lhs_dtype, lhs_scale_inv_ptr, lhs_trans, rhs_ptr, rhs_shape, rhs_dtype, + rhs_scale_inv_ptr, rhs_trans, out_ptr, out_shape, out_dtype, out_amax_ptr, out_scale_ptr, + bias_ptr, bias_dtype, pre_gelu_ptr, extra_out_ptr, extra_out_shape, workspace_ptr, + workspace_size, fuse_gelu, fuse_bias, grad, accumulate, use_split_accumulator, + static_cast(comm_type_flag), 
static_cast(name), stream); + + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL(CommGemmOverlapHandler, CommGemmOverlapFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // lhs + .Arg() // lhs_scale_inv + .Arg() // rhs + .Arg() // rhs_scale_inv + .Arg() // bias + .Arg() // gelu_input + .Arg() // out_amax + .Arg() // out_scale + .Ret() // out + .Ret() // out_amax_new + .Ret() // out_scale_new + .Ret() // pre_gelu_out + .Ret() // bias_grad + .Ret() // extra_out + .Ret() // workspace + .Attr("lhs_trans") + .Attr("rhs_trans") + .Attr("fuse_gelu") + .Attr("fuse_bias") + .Attr("grad") + .Attr("accumulate") + .Attr("use_split_accumulator") + .Attr("comm_type_flag") + .Attr("name"), + FFI_CudaGraph_Traits); + +} // namespace jax + +} // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 5dae9d6757..14148ecbd0 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -62,7 +62,8 @@ void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque auto *out_scale_updated = reinterpret_cast(buffers[10]); auto *pre_gelu_out = buffers[11]; auto *bias_grad = buffers[12]; - auto *workspace = buffers[13]; + // buffers[13] is the extra output for comm+GEMM overlap, not used here + auto *workspace = buffers[14]; // Operand aliasing NVTE_CHECK(bias == bias_grad, "bias not bound to bias_grad in TE/JAX GEMM"); @@ -88,9 +89,9 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out_amax, Buffer_Type out_scale, Result_Type out, Result_Type out_amax_updated, Result_Type out_scale_updated, - Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type workspace, - bool lhs_trans, bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, - bool accumulate, bool use_split_accumulator) { + Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type dummy_out, + Result_Type workspace, bool lhs_trans, bool rhs_trans, bool fuse_gelu, + bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator) { // Inputs auto lhs_ptr = lhs.untyped_data(); auto lhs_scale_inv_ptr = reinterpret_cast(lhs_scale_inv.untyped_data()); @@ -110,6 +111,7 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i auto out_dtype = convert_ffi_datatype_to_te_dtype(out->element_type()); auto pre_gelu_out_ptr = pre_gelu_out->untyped_data(); auto bias_grad_ptr = bias_grad->untyped_data(); + // dummy_out is the extra output for comm+GEMM overlap, not used here auto workspace_ptr = workspace->untyped_data(); auto workspace_size = workspace->dimensions().back(); @@ -151,6 +153,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GemmHandler, GemmFFI, .Ret() // out_scale_updated .Ret() // pre_gelu_out .Ret() // bias_grad + .Ret() // dummy_out .Ret() // workspace .Attr("lhs_trans") .Attr("rhs_trans") diff --git a/transformer_engine/jax/csrc/extensions/packing.cpp b/transformer_engine/jax/csrc/extensions/packing.cpp index 1a9ce987af..31a53529e3 100644 --- a/transformer_engine/jax/csrc/extensions/packing.cpp +++ b/transformer_engine/jax/csrc/extensions/packing.cpp @@ -80,15 +80,33 @@ pybind11::bytes PackCustomCallFusedAttnDescriptor( deterministic, window_size_left, window_size_right}); } -pybind11::bytes PackCustomCallGemmDescriptor(size_t batch, size_t m, size_t n, size_t k, - size_t workspace_size, DType operand_dtype, - DType bias_dtype, 
DType out_dtype, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, - bool grad, bool accumulate, +pybind11::bytes PackCustomCallGemmDescriptor(size_t m, size_t n, size_t k, size_t workspace_size, + DType operand_dtype, DType bias_dtype, DType out_dtype, + bool lhs_trans, bool rhs_trans, bool fuse_gelu, + bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator) { - return PackOpaque(CustomCallGemmDescriptor{batch, m, n, k, workspace_size, operand_dtype, - bias_dtype, out_dtype, lhs_trans, rhs_trans, fuse_gelu, - fuse_bias, grad, accumulate, use_split_accumulator}); + return PackOpaque(CustomCallGemmDescriptor{m, n, k, workspace_size, operand_dtype, bias_dtype, + out_dtype, lhs_trans, rhs_trans, fuse_gelu, fuse_bias, + grad, accumulate, use_split_accumulator}); +} + +pybind11::bytes PackCustomCallBufferDescriptor(const std::string &name, + const std::vector &shape, DType dtype, + CommOverlapType comm_type) { + return PackOpaque( + {name, shape.data(), shape.size(), dtype, comm_type}); +} + +pybind11::bytes PackCustomCallOverlapDescriptor(size_t m, size_t k, size_t n, size_t workspace_size, + DType operand_dtype, DType bias_dtype, + DType out_dtype, bool lhs_trans, bool rhs_trans, + bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator, + CommOverlapType comm_type, + const std::string &name) { + return PackOpaque( + {m, n, k, workspace_size, operand_dtype, bias_dtype, out_dtype, lhs_trans, rhs_trans, + fuse_gelu, fuse_bias, grad, accumulate, use_split_accumulator, comm_type, name}); } } // namespace jax diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index ddf98d9d78..2bf13a600d 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -8,6 +8,7 @@ #include "extensions.h" namespace transformer_engine { + namespace jax { template @@ -53,6 +54,8 @@ pybind11::dict Registrations() { dict["te_fused_attn_forward"] = EncapsulateFunction(FusedAttnForward); dict["te_fused_attn_backward"] = EncapsulateFunction(FusedAttnBackward); dict["te_gemm"] = EncapsulateFunction(Gemm); + dict["te_copy_into_overlap_buffer"] = EncapsulateFunction(CopyIntoOverlapBuffer); + dict["te_comm_gemm_overlap"] = EncapsulateFunction(CommGemmOverlap); // Transpose dict["te_transpose_ffi"] = EncapsulateFFI(TransposeHandler); @@ -104,6 +107,8 @@ pybind11::dict Registrations() { dict["te_fused_attn_backward_ffi"] = fused_attn_backward_ffi; dict["te_gemm_ffi"] = EncapsulateFFI(GemmHandler); + dict["te_copy_into_overlap_buffer_ffi"] = EncapsulateFFI(CopyIntoOverlapBufferHandler); + dict["te_comm_gemm_overlap_ffi"] = EncapsulateFFI(CommGemmOverlapHandler); return dict; } @@ -120,6 +125,8 @@ PYBIND11_MODULE(transformer_engine_jax, m) { m.def("pack_softmax_descriptor", &PackCustomCallSoftmaxDescriptor); m.def("pack_fused_attn_descriptor", &PackCustomCallFusedAttnDescriptor); m.def("pack_gemm_descriptor", &PackCustomCallGemmDescriptor); + m.def("pack_buffer_descriptor", &PackCustomCallBufferDescriptor); + m.def("pack_overlap_descriptor", &PackCustomCallOverlapDescriptor); m.def("get_fused_attn_backend", &GetFusedAttnBackend); m.def("get_cuda_version", &GetCudaRuntimeVersion); m.def("get_cudnn_version", &GetCudnnRuntimeVersion); @@ -132,7 +139,14 @@ PYBIND11_MODULE(transformer_engine_jax, m) { m.def("get_fused_attn_fwd_workspace_sizes", &GetFusedAttnForwardWorkspaceSizes); m.def("get_fused_attn_bwd_workspace_sizes", 
&GetFusedAttnBackwardWorkspaceSizes); m.def("nvte_get_qkv_format", &nvte_get_qkv_format); + m.def("bootstrap_comm_gemm_overlap", &BootstrapCommGemmOverlap); + m.def("destroy_comm_gemm_overlaps", &DestroyCommGemmOverlap); + m.def("set_buffer_scale_inv", &SetOverlapBufferScaleInverse, pybind11::arg(), pybind11::arg(), + pybind11::arg("grad") = false); + m.def("get_overlap_buffer", &GetOverlapBuffer); + m.def("overlap_buffer_is_fp8", &OverlapBufferIsFp8); } } // namespace jax + } // namespace transformer_engine diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 4cf09a204f..e463f0ace2 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -1,15 +1,18 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. +import os +import warnings import operator from functools import partial, reduce -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Sequence import jax import jax.numpy as jnp from jax.typing import ArrayLike from jax.sharding import NamedSharding, PartitionSpec +from transformer_engine import transformer_engine_jax as tex from .fp8 import FP8Helper, FP8MetaPackage from .cpp_extensions import ( gemm_impl, @@ -19,15 +22,23 @@ dbias_cast_transpose, dact_lu_dbias_cast_transpose, ) -from .cpp_extensions.gemm import sanitize_dims, mirror_dim + +from .cpp_extensions.gemm import sanitize_dims, mirror_dim, copy_into_overlap_buffer +from .cpp_extensions.misc import jax_dtype_is_fp8, jax_dtype_to_te_dtype +from .sharding import get_mesh_axis_size, global_mesh_resource __all__ = [ "gemm", "fp8_gemm", "type_safe_gemm", + "initialize_comm_gemm_overlaps", + "destroy_comm_gemm_overlap", ] +_NUM_MAX_UB_STREAMS = 3 +_ACTIVE_COMM_GEMM_OVERLAPS = dict() + def gemm( x: ArrayLike, @@ -37,12 +48,70 @@ def gemm( fuse_gelu: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, + comm_overlap_name: Optional[str] = None, + ag_overlap_skip_copy: bool = False, ) -> ArrayLike: - """Non-FP8 collective/distributed `nvte_cublas_gemm()` with GELU and bias-add fusions.""" - return _gemm(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + """ + Non-FP8 collective/distributed `nvte_cublas_gemm()` with GELU and bias-add fusions. + + Parameters + ---------- + x : ArrayLike + LHS operand, sized ([B], M, K) when not transposed. + kernel : ArrayLike + RHS operand, sized (K, N) when not transposed. + bias : Optional[ArrayLike], default = `None` + Optional bias term to add onto the (LHS x RHS) result. + contracting_dims : Tuple[int, int], default = `(-1, 0)` + Contracting dimensions of LHS and RHS, respectively, in the matrix-multiplication. + The default (-1, 0) describes the fully non-transposed 'NN' layout where LHS contracts in + the last dimension, and RHS contracts in the first dimension. + fuse_gelu : bool, default = `False` + Enable the GELU epilogue for GEMM. This applies GELU after the bias-addition if the bias + term is not `None`. + accumulate : bool, default = `False` + use_split_accumulator : bool, default = `False` + comm_overlap_name : Optional[str], default = `None` + Name of the comm+GEMM overlap layer that this GEMM is associated with. Comm+GEMM overlap + must be initialized with `te.jax.gemm.initialize_comm_gemm_overlaps()` before this + GEMM call, and the configuration dictionary used in the initialization must include + the name passed into this function. 
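+        For example (illustrative only), a layer that was registered during initialization under
+        the name "qkv_fprop" would be matched here by passing `comm_overlap_name="qkv_fprop"`.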
+ ag_overlap_skip_copy: bool = `False` + All-gather overlap requires the LHS operand to be copied into the communication buffer. + If the communication buffer already has the necessary data, setting this flag will + avoid an unnecessary memcpy operation. + """ + comm_overlap_config = None + if comm_overlap_name is not None: + comm_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(comm_overlap_name, None) + if comm_overlap_config is None: + warnings.warn( + f"Comm+GEMM overlap for {comm_overlap_name} has not been initialized! " + + "Sharded operands will trigger XLA collectives instead." + ) + + elif ( + not ag_overlap_skip_copy + and comm_overlap_config["method"] != "bulk" + and comm_overlap_config["comm_type"] == tex.CommOverlapType.AG + ): + if sanitize_dims(contracting_dims[0], x.ndim) != x.ndim - 1: + x = jnp.matrix_transpose(x) + copy_into_overlap_buffer(x, comm_overlap_name, tex.CommOverlapType.RS) + + return _gemm( + x, + kernel, + bias, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + comm_overlap_config, + ) -@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6)) +@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6, 7)) def _gemm( x: ArrayLike, kernel: ArrayLike, @@ -51,9 +120,17 @@ def _gemm( fuse_gelu: bool, accumulate: bool, use_split_accumulator: bool, + comm_overlap_config: dict, ) -> ArrayLike: out, _ = _gemm_fwd_rule( - x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator + x, + kernel, + bias, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + comm_overlap_config, ) return out @@ -66,6 +143,7 @@ def _gemm_fwd_rule( fuse_gelu: bool, accumulate: bool, use_split_accumulator: bool, + comm_overlap_config: dict, ) -> Tuple[ArrayLike, ...]: assert ( kernel.ndim == 2 @@ -78,7 +156,7 @@ def _gemm_fwd_rule( # # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) # (DP, None, TP) x (TP, None) --(AR)--> (DP, None, None) - out, pre_gelu_out = gemm_impl( + out, pre_gelu_out, extra_out = gemm_impl( x, kernel, bias=bias, @@ -88,16 +166,29 @@ def _gemm_fwd_rule( fuse_bias=fuse_bias, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, ) + # Update returned and saved tensors based on comm+GEMM overlap + saved_x = x + final_out = out + if comm_overlap_config is not None: + match comm_overlap_config.get("comm_type", None): + case tex.CommOverlapType.AG: + # AG overlap puts the all-gathered global LHS (X) into extra_out + saved_x = extra_out + case tex.CommOverlapType.RS: + # RS overlap puts the reduce-scattered sharded output into extra_out + final_out = extra_out + ctx = ( - x, + saved_x, kernel, pre_gelu_out if fuse_gelu else None, fuse_bias, ) - return out, ctx + return final_out, ctx def _gemm_bwd_rule( @@ -105,6 +196,7 @@ def _gemm_bwd_rule( fuse_gelu, accumulate, use_split_accumulator, + comm_overlap_config, ctx, grad, ): @@ -114,6 +206,11 @@ def _gemm_bwd_rule( mirror_dim, (x_inner_dim, kernel_inner_dim), (x.ndim, kernel.ndim) ) + dgrad_overlap_config = None + if comm_overlap_config is not None: + dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" + dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) + # FWD MODE: # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) ------> ([B], M, N/P) # (DP, TP, None) --(AG)--> (DP, None, None) x (None, TP) --> (DP, None, TP) @@ -127,7 +224,7 @@ def _gemm_bwd_rule( # # GEMM+AR: ([B], M, N) x (K/P, N)^T ------> ([B], M, K/P) # (DP, None, None) x (TP, None)^T 
--> (DP, None, TP) - dgrad, dgelu, _ = gemm_impl( + dgrad, dgelu, _, dgrad_extra_out = gemm_impl( grad, kernel, gelu_input=pre_gelu_out, @@ -138,17 +235,42 @@ def _gemm_bwd_rule( grad=True, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=dgrad_overlap_config, ) + # If dgrad overlapped reduce-scatter, set it to the RS output + if dgrad_overlap_config is not None: + if ( + dgrad_overlap_config["method"] != "bulk" + and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): + dgrad = dgrad_extra_out + + # Collapse batch dimension for wgrad + wgrad_rhs = dgelu if fuse_gelu else grad + if x.ndim > 2: + # If x was originally transposed, we need to transpose it back in order to collapse + # the batch dims correctly. + if x_inner_dim == x.ndim - 2: + x = jnp.matrix_transpose(x) + batch_size = reduce(operator.mul, x.shape[:-2], 1) + x = jnp.reshape(x, (batch_size * x.shape[-2], x.shape[-1])) + wgrad_rhs = jnp.reshape(wgrad_rhs, (batch_size * wgrad_rhs.shape[-2], wgrad_rhs.shape[-1])) + + # Recover comm+GEMM overlap config for wgrad + wgrad_overlap_config = None + if comm_overlap_config is not None: + wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" + wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + # WGRAD: # AG+GEMM: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) # (DP, 'tp', None)^T --(AG)-->(DP, None, None)^T x (DP, None, 'tp') --> (None, 'tp') # # GEMM+AR: ([B], M, K/P)^T --(AG)--> ([B], M, K)^T x ([B], M, N) ---------> (K/P, N) # (DP, None, 'tp')^T --(AG)--> (DP, None, None)^T x (DP, None, None) ----> (None, None) - # Make XLA scatter output in first dim. wgrad_rhs = dgelu if fuse_gelu else grad - wgrad, _, bgrad = gemm_impl( + wgrad, _, bgrad, wgrad_extra_out = gemm_impl( x, wgrad_rhs, gelu_input=pre_gelu_out, @@ -159,8 +281,17 @@ def _gemm_bwd_rule( grad=True, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=wgrad_overlap_config, ) + # If wgrad overlapped reduce-scatter, set it to the RS output + if wgrad_overlap_config is not None: + if ( + wgrad_overlap_config["method"] != "bulk" + and wgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): + wgrad = wgrad_extra_out + if not fuse_bias: bgrad = None @@ -179,8 +310,60 @@ def fp8_gemm( fuse_gelu: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, + comm_overlap_name: Optional[str] = None, + ag_overlap_skip_copy: bool = False, ) -> ArrayLike: - """Non-FP8 `nvte_cublas_gemm()` with optional GELU and bias-add fusions.""" + """ + FP8 collective/distributed `nvte_cublas_gemm()` with GELU and bias-add fusions. + + FP8 GEMM requires the LHS operand to be non-transposed, and the RHS operand to be transposed, + such that the contracting dimensions are always the last dimension for both operands. + + Parameters + ---------- + x : ArrayLike + Non-transposed LHS operand, sized ([B], M, K). + kernel_t : ArrayLike + Transposed RHS operand, sized (N, K). + fp8_meta : transformer_engine.jax.fp8.FP8MetaPackage + FP8MetaPackage object carrying amax, scale and scale_inv information for the GEMM operands. + bias : Optional[ArrayLike], default = `None` + Optional bias term to add onto the (LHS x RHS) result. + out_dtype : jnp.dtype, default = `jnp.bfloat16` + Data type of the FP8 GEMM output. If chosen as an FP8 dtype (i.e. `jnp.float8_e4m3fn` or + `jnp.float8_e5m2`), the `fp8_meta` must also contain amax and scale information for the + GEMM output. 
+ fuse_gelu : bool, default = `False` + Enable the GELU epilogue for GEMM. This applies GELU after the bias-addition if the bias + term is not `None`. + accumulate : bool, default = `False` + use_split_accumulator : bool, default = `False` + comm_overlap_name : Optional[str], default = `None` + Name of the comm+GEMM overlap layer that this GEMM is associated with. Comm+GEMM overlap + must be initialized with `te.jax.gemm.initialize_comm_gemm_overlaps()` before this + GEMM call, and the configuration dictionary used in the initialization must include + the name passed into this function. + ag_overlap_skip_copy: bool = `False` + All-gather overlap requires the LHS operand to be copied into the communication buffer. + If the communication buffer already has the necessary data, setting this flag will + avoid an unnecessary memcpy operation. + """ + comm_overlap_config = None + if comm_overlap_name is not None: + comm_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(comm_overlap_name, None) + if comm_overlap_config is None: + warnings.warn( + f"Comm+GEMM overlap for {comm_overlap_name} has not been initialized! " + + "Sharded operands will trigger XLA collectives instead." + ) + + elif ( + not ag_overlap_skip_copy + and comm_overlap_config["method"] != "bulk" + and comm_overlap_config["comm_type"] == tex.CommOverlapType.AG + ): + copy_into_overlap_buffer(x, comm_overlap_name, tex.CommOverlapType.RS) + return _fp8_gemm( x, kernel_t, @@ -191,6 +374,7 @@ def fp8_gemm( fuse_gelu, accumulate, use_split_accumulator, + comm_overlap_config, ) @@ -205,6 +389,7 @@ def _fp8_gemm( fuse_gelu: bool, accumulate: bool, use_split_accumulator: bool, + comm_overlap_config: dict, ) -> ArrayLike: out, _ = _fp8_gemm_fwd_rule( x, @@ -216,6 +401,7 @@ def _fp8_gemm( fuse_gelu, accumulate, use_split_accumulator, + comm_overlap_config, ) return out @@ -230,6 +416,7 @@ def _fp8_gemm_fwd_rule( fuse_gelu: bool, accumulate: bool, use_split_accumulator: bool, + comm_overlap_config: dict, ) -> Tuple[ArrayLike, ...]: assert ( kernel_t.ndim == 2 @@ -298,7 +485,26 @@ def _fp8_gemm_fwd_rule( if out_dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] else None ) - out, updated_out_amax, updated_out_scale, pre_gelu_out = fp8_gemm_impl( + + # Set scale_inv for comm overlap buffer + buffer_scale_inv = None + if comm_overlap_config is not None: + overlap_name = comm_overlap_config["name"] + + if comm_overlap_config["method"] != "bulk" and tex.overlap_buffer_is_fp8(overlap_name): + match comm_overlap_config["comm_type"]: + case tex.CommOverlapType.AG: + buffer_scale_inv = x_scale_inv + + case tex.CommOverlapType.RS: + buffer_scale_inv = jnp.reciprocal(out_scale) + + tex.set_overlap_buffer_scale_inverse( + overlap_name, + jax.dlpack.to_dlpack(buffer_scale_inv), + ) + + out, updated_out_amax, updated_out_scale, pre_gelu_out, extra_out = fp8_gemm_impl( casted_x, x_scale_inv, casted_kernel_t, @@ -312,12 +518,26 @@ def _fp8_gemm_fwd_rule( fuse_bias=fuse_bias, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, ) - if out_dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + if not jax_dtype_is_fp8(out_dtype): updated_out_amax = None updated_out_scale = None + # Update returned and saved arrays based on comm+GEMM overlap config + final_out = out + saved_casted_x = casted_x + if comm_overlap_config is not None: + match comm_overlap_config.get("comm_type", None): + case tex.CommOverlapType.AG: + # AG overlap puts all-gathered global LHS (X) array into extra_out + saved_casted_x = extra_out + 
case tex.CommOverlapType.RS: + # RS overlap puts the reduce-scattered sharded output into extra_out + final_out = extra_out + ctx = ( + saved_casted_x, casted_x_t, casted_kernel, amax_list, @@ -332,7 +552,7 @@ def _fp8_gemm_fwd_rule( (x.ndim > 2), ) - return (out, updated_out_scale), ctx + return (final_out, updated_out_amax, updated_out_scale), ctx def _fp8_gemm_bwd_rule( @@ -340,6 +560,7 @@ def _fp8_gemm_bwd_rule( fuse_gelu, accumulate, use_split_accumulator, + comm_overlap_config, ctx, grad, ): @@ -407,28 +628,128 @@ def _fp8_gemm_bwd_rule( ) bgrad = None + # Recover dgrad comm+GEMM overlap config + dgrad_overlap_config = None + if comm_overlap_config is not None: + dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" + dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) + + # Set scale_inv for comm overlap buffer + dgrad_out_dtype = jnp.bfloat16 + dgrad_amax = None + dgrad_scale = None + if ( + dgrad_overlap_config is not None + and dgrad_overlap_config["method"] != "bulk" + and tex.overlap_buffer_is_fp8(dgrad_overlap_name) + ): + dgrad_out_dtype = bwd_dtype + dgrad_amax = grad_amax + dgrad_scale = grad_scale + tex.set_overlap_buffer_scale_inverse( + dgrad_overlap_name, + jax.dlpack.to_dlpack(grad_scale_inv), + ) + + # DGRAD: ([B], M, N) x (K, N)^T = ([B], M, K) kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] - dgrad, *_ = fp8_gemm_impl( + dgrad, *_, dgrad_extra_out = fp8_gemm_impl( casted_grad, grad_scale_inv, casted_kernel, kernel_scale_inv, + None, + None, + dgrad_amax, + dgrad_scale, + out_dtype=dgrad_out_dtype, batched_output=batched_input, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=dgrad_overlap_config, ) + # If dgrad overlapped reduce-scatter, set it to the RS output + if ( + dgrad_overlap_config is not None + and dgrad_overlap_config["method"] != "bulk" + and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): + dgrad = dgrad_extra_out + + if fuse_gelu and fuse_bias: + # Fuse bgrad with dGELU. + _, casted_dgelu_t, bgrad, updated_grad_amax = dact_lu_dbias_cast_transpose( + grad, + pre_gelu_out, + grad_amax, + grad_scale, + grad_scale_inv, + bwd_dtype, + static_axis_boundary=-1, + transpose_axis_boundary=-1, + activation_type=("gelu",), + ) + elif fuse_gelu: + # No bias grad to fuse so we just do dGELU. 
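+        # (dact_lu applies the GELU derivative to `grad` using the pre-GELU output saved from the
+        # forward pass; its casted/transposed result feeds the wgrad GEMM below.)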
+ _, casted_dgelu_t, updated_grad_amax = dact_lu(grad, pre_gelu_out, ("gelu",)) + bgrad = None + + # Recover wgrad config + wgrad_overlap_config = None + if comm_overlap_config is not None: + wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" + wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + + # Set scale_inv for comm overlap buffer + wgrad_out_dtype = jnp.bfloat16 + wgrad_amax = None + wgrad_scale = None + if ( + wgrad_overlap_config is not None + and wgrad_overlap_config["method"] != "bulk" + and tex.overlap_buffer_is_fp8(wgrad_overlap_name) + ): + match wgrad_overlap_config["comm_type"]: + case tex.CommOverlapType.AG: + buffer_scale_inv = x_scale_inv + case tex.CommOverlapType.RS: + buffer_scale_inv = grad_scale_inv + wgrad_out_dtype = bwd_dtype + wgrad_amax = grad_amax + wgrad_scale = grad_scale + tex.set_overlap_buffer_scale_inverse( + dgrad_overlap_name, + jax.dlpack.to_dlpack(buffer_scale_inv), + ) + + # WGRAD: ([B], N, M) x ([B], K, M)^T = (N, K) + wgrad_rhs_t = casted_dgelu_t if fuse_gelu else casted_grad_t x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] - wgrad, *_ = fp8_gemm_impl( + wgrad, *_, wgrad_extra_out = fp8_gemm_impl( casted_x_t, x_scale_inv, - casted_grad_t, + wgrad_rhs_t, grad_scale_inv, - out_shape=False, + None, + None, + wgrad_amax, + wgrad_scale, + out_dtype=wgrad_out_dtype, + batched_output=False, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=wgrad_overlap_config, ) + # If wgrad overlapped reduce-scatter, set it to the RS output + if ( + wgrad_overlap_config is not None + and wgrad_overlap_config["method"] != "bulk" + and wgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): + wgrad = wgrad_extra_out + amax_list[FP8MetaPackage.INPUT_IDX] = ( amax_list[FP8MetaPackage.INPUT_IDX].at[0].set(updated_x_amax[0]) ) @@ -462,11 +783,9 @@ def type_safe_gemm( fuse_gelu: bool = False, accumulate: bool = False, use_split_accumulator: bool = False, + comm_overlap_name: Optional[str] = None, ) -> ArrayLike: - if x.dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2] or kernel.dtype in [ - jnp.float8_e4m3fn, - jnp.float8_e5m2, - ]: + if jax_dtype_is_fp8(x.dtype) or jax_dtype_is_fp8(kernel.dtype): assert fp8_meta is not None, "GEMM operands have FP8 dtypes but FP8MetaPackage is None." if fp8_meta is not None: @@ -484,6 +803,212 @@ def type_safe_gemm( fuse_gelu, accumulate, use_split_accumulator, + comm_overlap_name, ) else: - return gemm(x, kernel, bias, contracting_dims, fuse_gelu, accumulate, use_split_accumulator) + return gemm( + x, + kernel, + bias, + contracting_dims, + fuse_gelu, + accumulate, + use_split_accumulator, + comm_overlap_name, + ) + + +def initialize_comm_gemm_overlaps( + buffer_shape: Sequence[int], + buffer_dtype: jnp.dtype, + mesh: Optional[jax.sharding.Mesh] = None, + tp_resource: Optional[str] = None, + use_fp8: bool = False, + overlap_configs: Optional[dict] = None, +) -> None: + assert tex.ubuf_built_with_mpi(), ( + "Comm+GEMM overlap in TE/JAX requires Transformer Engine to be compiled with " + + "`NVTE_UB_WITH_MPI=1` and `MPI_HOME=/path/to/mpi` options." + ) + if not tex.device_supports_multicast(): + assert bool(int(os.getenv("UB_SKIPMC", "0"))), ( + "CUDA device, driver and/or toolkit version does not support comm+GEMM overlap with " + + "CUDA Multicast. Launch app with UB_SKIPMC=1 to try CUDA IPC instead." 
+        )
+
+    # Get # of devices in the mesh axis for comm+GEMM overlap
+    tp_resource = global_mesh_resource().tp_resource if tp_resource is None else tp_resource
+    tp_size = get_mesh_axis_size(tp_resource, mesh=mesh)
+
+    # Layers that support comm+GEMM overlap
+    layers_all_gather_overlap = [
+        "generic_ag",
+        "qkv_fprop",
+        "qkv_dgrad",
+        "proj_dgrad",
+        "fc1_fprop",
+        "fc1_dgrad",
+        "fc2_dgrad",
+    ]
+    layers_reduce_scatter_overlap = [
+        "generic_rs",
+        "proj_fprop",
+        "fc2_fprop",
+        "qkv_wgrad",
+        "fc1_wgrad",
+    ]
+    dgrad_reduce_scatter_overlap = ["qkv_dgrad", "fc1_dgrad"]
+
+    # Default overlap methods for layers
+    methods = {
+        "ring_exchange": [
+            "generic_ag",
+            "generic_rs",
+            "qkv_fprop",
+            "fc1_fprop",
+            "proj_dgrad",
+            "fc2_dgrad",
+        ],
+        "pipeline": ["proj_fprop", "fc2_fprop"],
+        "bulk": ["qkv_dgrad", "qkv_wgrad", "fc1_dgrad", "fc1_wgrad"],
+    }
+
+    # AG-RS overlap pairs of layers forming a tensor-parallel block
+    ag_rs_pairs = {"qkv_fprop": "proj_fprop", "fc1_fprop": "fc2_fprop"}
+    rs_ag_pairs = {v: k for k, v in ag_rs_pairs.items()}
+    global layers_atomic_ring_exchange
+    layers_atomic_ring_exchange = []
+
+    def get_method(name):
+        for method, names in methods.items():
+            if name in names:
+                return method
+        raise KeyError(f"Given layer name {name} does not exist.")
+
+    def get_default_config(name):
+        method = get_method(name)
+        default_cfg = {
+            "method": method,
+            "comm_type": (
+                tex.CommOverlapType.AG
+                if name in layers_all_gather_overlap
+                else tex.CommOverlapType.RS
+            ),
+            "num_sm": 1 if method == "ring_exchange" else 16,
+            "cga_size": 1 if method == "ring_exchange" else 2,
+            "set_sm_margin": False,
+            "num_splits": 4 if method == "pipeline" else tp_size,
+            "aggregate": False,
+            "atomic_gemm": False,
+            "pipeline_rs_overlap_first_gemm": False,
+            "use_ce": True,
+            "fp8_buf": name in layers_all_gather_overlap,
+        }
+        return default_cfg
+
+    def add_new_comm_gemm_overlap(
+        name: str,
+        method: str,
+        shape: Sequence[int],
+        dtype: jnp.dtype,
+        comm_type: tex.CommOverlapType,
+        num_sm: int = 16,
+        cga_size: int = 2,
+        set_sm_margin: bool = False,
+        num_splits: int = 4,
+        aggregate: bool = False,
+        atomic_gemm: bool = False,
+        pipeline_rs_overlap_first_gemm: bool = False,
+        use_ce: bool = True,
+        fp8_buf: bool = False,
+    ) -> None:
+        assert (
+            name not in _ACTIVE_COMM_GEMM_OVERLAPS
+        ), f"Duplicate initialization for `{name}` overlap!"
+
+        if atomic_gemm:
+            warnings.warn(
+                "Atomic GEMM uses a beta API from cublas and is not tested for all use cases."
+            )
+            assert use_fp8, "Atomic GEMM overlap supported only for FP8 GEMM."
+            if method == "bulk":
+                warnings.warn(
+                    f"At {name}, atomic GEMM is not supported for a bulk overlap. "
+                    "Defaulting to `atomic_gemm=False`."
+                )
+                atomic_gemm = False
+        if method == "pipeline" and comm_type == tex.CommOverlapType.AG:
+            raise ValueError(
+                f"At {name}, `pipeline` overlap method is not supported for AllGather."
+            )
+        # Check if both AG and RS overlaps use `atomic GEMM` + `p2p ring-exchange`.
+        # Using atomic GEMM + p2p ring-exchange in only one of the pair breaks functionality.
+        global layers_atomic_ring_exchange
+        if atomic_gemm and method == "ring_exchange" and name in ag_rs_pairs:
+            layers_atomic_ring_exchange += [name, ag_rs_pairs[name]]
+        if name in rs_ag_pairs:
+            assert_message = (
+                f"At {name}, atomic AG-GEMM overlap with `ring_exchange` shuffles GEMM chunk "
+                "outputs, and RS-GEMM overlap un-shuffles them. When one of the GEMM-AG and "
+                "GEMM-RS overlaps forming a TP block (e.g., qkv_fprop and proj_fprop) uses "
+                "`atomic gemm` and `ring_exchange`, its pair must use the same overlap config "
+                "for functionality."
+            )
+            if name in layers_atomic_ring_exchange:
+                assert atomic_gemm and method == "ring_exchange", assert_message
+            else:
+                if atomic_gemm and method == "ring_exchange":
+                    assert rs_ag_pairs[name] in layers_atomic_ring_exchange, assert_message
+
+        dtype = jnp.uint8 if (use_fp8 and fp8_buf) else dtype
+        tex.bootstrap_comm_gemm_overlap(
+            name,
+            method,
+            shape,
+            jax_dtype_to_te_dtype(dtype),
+            comm_type,
+            tp_size,
+            num_splits,
+            _NUM_MAX_UB_STREAMS,
+            cga_size,
+            num_sm,
+            set_sm_margin,
+            use_ce,
+            atomic_gemm,
+            aggregate,
+            pipeline_rs_overlap_first_gemm,
+        )
+
+    if overlap_configs is not None:
+        for name in dgrad_reduce_scatter_overlap:
+            if (
+                name in overlap_configs
+                and "method" in overlap_configs[name]
+                and overlap_configs[name]["method"] != "bulk"
+            ):
+                wgrad_name = name.replace("dgrad", "wgrad")
+                assert wgrad_name not in overlap_configs
+                layers_reduce_scatter_overlap.remove(wgrad_name)
+                layers_all_gather_overlap.remove(name)
+                layers_reduce_scatter_overlap.append(name)
+                methods["bulk"].remove(name)
+                methods["bulk"].remove(wgrad_name)
+                new_method = overlap_configs[name]["method"]
+                methods[new_method].append(name)
+
+    global _ACTIVE_COMM_GEMM_OVERLAPS
+    for name in methods["ring_exchange"] + methods["pipeline"] + methods["bulk"]:
+        if overlap_configs is not None and name in overlap_configs:
+            fp8_buf = (name in layers_all_gather_overlap) or (
+                overlap_configs[name].get("fp8_buf", False) and name in methods["pipeline"]
+            )
+            final_config = get_default_config(name)
+            final_config.update(overlap_configs[name])
+            final_config["fp8_buf"] = fp8_buf
+            add_new_comm_gemm_overlap(name, shape=buffer_shape, dtype=buffer_dtype, **final_config)
+            _ACTIVE_COMM_GEMM_OVERLAPS.update({name: final_config})
+
+
+def destroy_comm_gemm_overlaps():
+    global _ACTIVE_COMM_GEMM_OVERLAPS
+    for name in _ACTIVE_COMM_GEMM_OVERLAPS:
+        tex.destroy_comm_gemm_overlap(name)
+    _ACTIVE_COMM_GEMM_OVERLAPS = dict()
diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h
index 3b49ece4a3..d906bba98f 100644
--- a/transformer_engine/pytorch/csrc/extensions.h
+++ b/transformer_engine/pytorch/csrc/extensions.h
@@ -553,7 +553,8 @@ class CommOverlap : torch::CustomClassHolder, public transformer_engine::CommOve
   CommOverlap(const std::vector<size_t> &buffer_shape, at::ScalarType buffer_dtype,
               CommOverlapHelper *helper, int tp_size, int num_splits = 3,
               int num_max_streams = NVTE_COMM_OVERLAP_MAX_STREAMS, int comm_cga_size = 2,
-              int num_comm_sm = 16, bool set_sm_margin = true, bool atomic_gemm = false);
+              int num_comm_sm = 16, bool set_sm_margin = true, bool atomic_gemm = false,
+              bool overlap_first_gemm = false);

   void set_ubuf_scale_inv(torch::Tensor scale_inv) {
     assert(scale_inv.numel());
diff --git a/transformer_engine/pytorch/csrc/extensions/comm_gemm_overlap.cpp b/transformer_engine/pytorch/csrc/extensions/comm_gemm_overlap.cpp
index d212d13516..587e3115b9 100644
--- a/transformer_engine/pytorch/csrc/extensions/comm_gemm_overlap.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/comm_gemm_overlap.cpp
@@ -186,13 +186,13 @@ void CommOverlapHelper::ub_barrier(ExtComm group) {

 CommOverlap::CommOverlap(const std::vector<size_t> &buffer_shape, at::ScalarType buffer_dtype,
                          CommOverlapHelper *helper, int tp_size, int num_splits,
                          int num_max_streams, int
comm_cga_size, int num_comm_sm, - bool set_sm_margin, bool atomic_gemm) - : te::CommOverlapBase(buffer_shape, GetTransformerEngineDType(buffer_dtype), helper->myrank, - helper->numranks, helper->mylocal, helper->numlocal, helper->mynode, - helper->numnodes, tp_size, - std::bind(&CommOverlapHelper::ub_allgather, helper, _1, _2, _3, _4, _5), - std::bind(&CommOverlapHelper::ub_barrier, helper, _1), num_splits, - num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, atomic_gemm) { + bool set_sm_margin, bool atomic_gemm, bool overlap_first_gemm) + : te::CommOverlapBase( + buffer_shape, GetTransformerEngineDType(buffer_dtype), helper->myrank, helper->numranks, + helper->mylocal, helper->numlocal, helper->mynode, helper->numnodes, tp_size, + std::bind(&CommOverlapHelper::ub_allgather, helper, _1, _2, _3, _4, _5), + std::bind(&CommOverlapHelper::ub_barrier, helper, _1), num_splits, num_max_streams, + comm_cga_size, num_comm_sm, set_sm_margin, atomic_gemm, overlap_first_gemm) { // Even though we never use these PyTorch tensor wrappers directly, they're still necessary to // for PyTorch to factor externally allocated memory into its memory pool and garbage collection // threshold calculation. diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 8856553c54..9841b5d640 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -263,12 +263,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::class_(m, "CommOverlap") .def(py::init &, at::ScalarType, CommOverlapHelper *, int, int, int, - int, int, bool, bool>(), + int, int, bool, bool, bool>(), py::call_guard(), py::arg("buffer_shape"), py::arg("buffer_dtype"), py::arg("helper"), py::arg("tp_size"), py::arg("num_splits") = 3, py::arg("num_max_streams") = NVTE_COMM_OVERLAP_MAX_STREAMS, py::arg("comm_cga_size") = 2, py::arg("num_comm_sm") = 16, - py::arg("set_sm_margin") = true, py::arg("atomic_gemm") = false) + py::arg("set_sm_margin") = true, py::arg("atomic_gemm") = false, + py::arg("overlap_first_gemm") = false) .def("bulk_overlap", &CommOverlap::bulk_overlap, py::call_guard()) .def("split_overlap_rs", &CommOverlap::split_overlap_rs, py::call_guard()) diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index d115efedaa..164d371985 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -306,6 +306,7 @@ def get_default_config(name): "num_splits": 4 if method == "pipeline" else tp_size, "aggregate": False, "atomic_gemm": False, + "pipeline_rs_overlap_first_gemm": False, "use_ce": True, "fp8_buf": name in layers_all_gather_overlap, } @@ -314,13 +315,14 @@ def get_default_config(name): def add_ub( name: str, method: str, - is_reduce_scatter: int, + is_reduce_scatter: bool, num_sm: int = 16, cga_size: int = 2, - set_sm_margin: int = 0, - num_splits: int = 0, - aggregate: int = 0, - atomic_gemm: int = 0, + set_sm_margin: bool = False, + num_splits: int = 4, + aggregate: bool = False, + atomic_gemm: bool = False, + pipeline_rs_overlap_first_gemm: bool = False, use_ce: bool = True, fp8_buf: bool = False, ) -> None: @@ -386,6 +388,7 @@ def add_ub( num_comm_sm=num_sm, set_sm_margin=set_sm_margin, atomic_gemm=atomic_gemm, + overlap_first_gemm=pipeline_rs_overlap_first_gemm, ) _ub_communicators[name] = ub_obj From b306608da620a95e53a2dc2dca8e7063bc950277 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 3 Dec 2024 
14:08:27 +0000 Subject: [PATCH 21/32] AG+GEMM overlap working Signed-off-by: Alp Dener --- .../transformer_engine/comm_gemm_overlap.h | 4 +- .../common/util/pybind_helper.h | 1 + transformer_engine/jax/cpp_extensions/gemm.py | 762 +++++++++++------- transformer_engine/jax/csrc/extensions.h | 109 +-- .../jax/csrc/extensions/comm_gemm_overlap.cpp | 233 +++--- .../jax/csrc/extensions/gemm.cpp | 50 +- .../jax/csrc/extensions/packing.cpp | 19 - .../jax/csrc/extensions/pybind.cpp | 19 +- transformer_engine/jax/gemm.py | 314 ++++---- 9 files changed, 873 insertions(+), 638 deletions(-) diff --git a/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h b/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h index 16e4ccf16a..0605825c82 100644 --- a/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h +++ b/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h @@ -29,9 +29,9 @@ namespace transformer_engine { */ bool ubuf_built_with_mpi(); -enum class CommOverlapType : int32_t { RS = 0, AG = 1 }; +enum class CommOverlapType : int { RS = 0, AG = 1 }; -enum class CommOverlapAlgo : int32_t { +enum class CommOverlapAlgo : int { BULK_OVERLAP_AG = 0, BULK_OVERLAP_RS = 1, SPLIT_PIPELINED_AG_P2P = 2, diff --git a/transformer_engine/common/util/pybind_helper.h b/transformer_engine/common/util/pybind_helper.h index 6fa9574f63..9091e7e364 100644 --- a/transformer_engine/common/util/pybind_helper.h +++ b/transformer_engine/common/util/pybind_helper.h @@ -19,6 +19,7 @@ pybind11::enum_(m, "DType") \ .value("kByte", transformer_engine::DType::kByte) \ .value("kInt32", transformer_engine::DType::kInt32) \ + .value("kInt64", transformer_engine::DType::kInt64) \ .value("kFloat32", transformer_engine::DType::kFloat32) \ .value("kFloat16", transformer_engine::DType::kFloat16) \ .value("kBFloat16", transformer_engine::DType::kBFloat16) \ diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 2ff98c20d9..59bf28434d 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -27,15 +27,15 @@ ) from ..sharding import ( global_mesh_resource, - lax_paral_op, all_reduce_max_along_all_axes_except_PP, - get_mesh_axis_size, ) __all__ = [ "fp8_gemm_impl", "gemm_impl", + "copy_into_overlap_buffer", + "bootstrap_comm_gemm_overlap", ] _COMM_GEMM_OVERLAP_LAYERS = ["qkv", "proj", "fc1", "fc2"] @@ -43,7 +43,7 @@ [layer + "_fprop" for layer in _COMM_GEMM_OVERLAP_LAYERS] + [layer + "_dgrad" for layer in _COMM_GEMM_OVERLAP_LAYERS] + [layer + "_wgrad" for layer in _COMM_GEMM_OVERLAP_LAYERS if layer != "fc2"] - + ["generic_ag", "generic_rs"] + + ["ag_gemm", "gemm_rs"] ) @@ -68,7 +68,7 @@ class CollectiveGemmPrimitive(BasePrimitive): """ name = "te_gemm" - impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15, 16) + impl_static_args = (10, 11, 12, 13, 14, 15, 16, 17, 18) multiple_results = True inner_primitive = None outer_primitive = None @@ -81,9 +81,10 @@ def abstract( rhs_scale_inv_aval, bias_aval, gelu_input_aval, + out_aval, out_amax_aval, out_scale_aval, - out_dtype, + extra_out_aval, batched_output, contracting_dims, fuse_gelu, @@ -92,16 +93,19 @@ def abstract( accumulate, use_split_accumulator, comm_overlap_config, + sharded_abstract, ): """ cuBlasLt GEMM abstract """ - del grad, accumulate, use_split_accumulator + if comm_overlap_config is not None: + assert tex.ubuf_built_with_mpi(), ( + "Comm+GEMM overlap in TE/JAX requires Transformer Engine to be 
compiled with " + + "`NVTE_UB_WITH_MPI=1` and `MPI_HOME=/path/to/mpi` options." + ) + assert is_ffi_enabled(), "Comm+GEMM overlap is supported only via XLA FFI." - assert tex.ubuf_built_with_mpi(), ( - "Comm+GEMM overlap in TE/JAX requires Transformer Engine to be compiled with " - + "`NVTE_UB_WITH_MPI=1` and `MPI_HOME=/path/to/mpi` options." - ) + del grad, accumulate, use_split_accumulator # Validate operand dtypes lhs_dtype = dtypes.canonicalize_dtype(lhs_aval.dtype) @@ -120,13 +124,14 @@ def abstract( and dtypes.canonicalize_dtype(rhs_scale_inv_aval.dtype) == jnp.float32 ), "Missing RHS operand scale inverse in FP8 GEMM." - # Validate operand layouts, adjusted for comm-overlap if necessary + # Validate operand layouts lhs_inner_dim, rhs_inner_dim = map( sanitize_dims, contracting_dims, (lhs_aval.ndim, rhs_aval.ndim) ) - assert ( - lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim] - ), f"Incompatible contracting dimensions: {lhs_aval.shape} x {rhs_aval.shape}." + assert lhs_aval.shape[lhs_inner_dim] == rhs_aval.shape[rhs_inner_dim], ( + "Incompatible operand sizes: " + + f"{lhs_aval.shape} @ idx {lhs_inner_dim} X {rhs_aval.shape} @ idx {rhs_inner_dim}." + ) lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 @@ -137,80 +142,125 @@ def abstract( assert not lhs_trans, "FP8 GEMM does not support transposed LHS." assert rhs_trans, "FP8 GEMM requires transposed RHS." - # Validate output dtype - if jax_dtype_is_fp8(out_dtype): - assert jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8( - rhs_dtype - ), "FP8 GEMM output requires FP8 inputs." - assert ( - out_amax_aval.size == out_scale_aval.size == 1 - ), "Invalid/missing output amax and scale." - out_amax_updated_dtype = dtypes.canonicalize_dtype(out_amax_aval.dtype) - out_scale_updated_dtype = dtypes.canonicalize_dtype(out_scale_aval.dtype) - assert ( - out_amax_updated_dtype == out_scale_updated_dtype == jnp.float32 - ), "Invalid output amax or scale dtype." - else: - out_dtype = lhs_dtype - out_amax_updated_dtype = jnp.float32 - out_scale_updated_dtype = jnp.float32 - # Make sure leading dimensions of RHS is broadcast-compatible with LHS lhs_outer_dim, rhs_outer_dim = map( mirror_dim, (lhs_inner_dim, rhs_inner_dim), (lhs_aval.ndim, rhs_aval.ndim), ) - lhs_bdims = [ - dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] - ] - lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] - lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) - - if rhs_aval.ndim > 2: + if lhs_aval.ndim > 2 and rhs_aval.ndim > 2: + assert not batched_output, ( + "Batched output requires batched LHS and non-batched RHS operands." + ) + lhs_bdims = [ + dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] + ] + lhs_batch_shape = [lhs_aval.shape[dim] for dim in lhs_bdims] + lhs_batch_size = reduce(operator.mul, lhs_batch_shape, 1) rhs_bdims = [ dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] ] rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] - rhs_batch_size = reduce(operator.mul, rhs_bdims, 1) - if rhs_batch_size > 1: - assert lhs_batch_size == rhs_batch_size, ( - f"Leading dimensins of RHS ({rhs_batch_shape=}) is not broadcast-compatible " - + f"with the leading dimensions of LHS ({lhs_batch_shape=})." 
- ) + rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) + assert lhs_batch_size == rhs_batch_size, ( + "Leading dimensions of LHS and RHS are not broadcast-compatible: " + + f"{lhs_aval.shape} @ idx {lhs_inner_dim} X {rhs_aval.shape} @ idx {rhs_inner_dim}" + ) - # Infer output shape - if batched_output: - assert ( - lhs_aval.ndim > 2 and rhs_aval.ndim == 2 - ), "Batched output requires batched LHS and non-batched RHS operands." - out_shape = ( - *lhs_batch_shape, - lhs_aval.shape[lhs_outer_dim], - rhs_aval.shape[rhs_outer_dim], + # Validate output dtypes + out_dtype = dtypes.canonicalize_dtype(out_aval.dtype) + if jax_dtype_is_fp8(out_dtype): + assert jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8(rhs_dtype), ( + "FP8 GEMM output requires FP8 inputs." + ) + assert out_amax_aval.size == out_scale_aval.size == 1, ( + "Invalid/missing output amax and scale." + ) + out_amax_updated_dtype = dtypes.canonicalize_dtype(out_amax_aval.dtype) + out_scale_updated_dtype = dtypes.canonicalize_dtype(out_scale_aval.dtype) + assert out_amax_updated_dtype == out_scale_updated_dtype == jnp.float32, ( + "Invalid output amax or scale dtype." ) else: - assert ( - lhs_aval.ndim == rhs_aval.ndim - ), "Non-batched output requires LHS and RHS operands with same number of dimensions." - if lhs_aval.ndim > 2: - rhs_bdims = [ - dim for dim in range(rhs_aval.ndim) if dim not in [rhs_outer_dim, rhs_inner_dim] - ] - rhs_batch_shape = [rhs_aval.shape[dim] for dim in rhs_bdims] - rhs_batch_size = reduce(operator.mul, rhs_batch_shape, 1) - assert lhs_batch_size == rhs_batch_size, ( - f"Leading dimensins of RHS ({rhs_aval.shape=}) is not broadcast-compatible " - + f"with the leading dimensions of LHS ({lhs_aval.shape=})." - ) - out_shape = (lhs_aval.shape[lhs_outer_dim], rhs_aval.shape[rhs_outer_dim]) + assert out_dtype == lhs_dtype, ( + "Output buffer has incorrect dtype: " + + f"expected {lhs_dtype} but found {out_dtype}" + ) + out_amax_updated_dtype = jnp.float32 + out_scale_updated_dtype = jnp.float32 + + # Validate output buffers + out_shape = out_aval.shape + expected_out_shape = [ + *lhs_aval.shape[:-2], + lhs_aval.shape[lhs_outer_dim], + rhs_aval.shape[rhs_outer_dim] + ] + extra_out_shape = extra_out_aval.shape + expected_extra_out_shape = [0] + extra_out_dtype = dtypes.canonicalize_dtype(extra_out_aval.dtype) + expected_extra_out_dtype = jnp.bfloat16 + if batched_output: + assert out_aval.ndim > 2, "Batched output buffer is missing batch dimensions." + else: + expected_out_shape = [reduce(operator.mul, expected_out_shape[:-1], 1), + expected_out_shape[-1]] - # Validate bias/bias_grad shape against inferred output + if (comm_overlap_config is not None + and comm_overlap_config["method"] != "bulk"): + comm_type = comm_overlap_config.get("comm_type", None) + assert comm_type is not None, "Missing comm type for comm+GEMM overlap." + + tp_size = comm_overlap_config.get("tp_size", 1) + assert tp_size > 1, ( + "Comm+GEMM overlap requires tensor-parallel mesh axis size greater than 1." 
+ ) + + if comm_type == tex.CommOverlapType.AG: + expected_extra_out_shape = list(lhs_aval.shape).copy() + elif comm_type == tex.CommOverlapType.RS: + expected_extra_out_shape = list(expected_out_shape).copy() + expected_extra_out_dtype = lhs_dtype + + if sharded_abstract: + if comm_type == tex.CommOverlapType.AG: + expected_out_shape[-2] *= tp_size + expected_extra_out_shape[-2] *= tp_size + else: + expected_extra_out_shape[-2] = expected_extra_out_shape[-2] // tp_size + + assert out_aval.ndim == len(expected_out_shape), ( + "Output buffer has incorrect number of dimensions: " + + f"expected {len(expected_out_shape)} but found {out_aval.ndim}" + ) + assert all([out_aval.shape[i] == expected_out_shape[i] for i in range(out_aval.ndim)]), ( + "Output buffer has incorrect shape: " + + f"expected {expected_out_shape=} but found {out_aval.shape=}" + ) + + assert extra_out_dtype == expected_extra_out_dtype, ( + "Extra output has incorrect dtype: " + + f"expected {expected_extra_out_dtype} but found {extra_out_dtype}" + ) + assert extra_out_aval.ndim == len(expected_extra_out_shape), ( + "Extra output buffer has incorrect number of dimensions: " + + f"expected {len(expected_extra_out_shape)} but found {extra_out_aval.ndim}" + ) + assert all([extra_out_aval.shape[i] == expected_extra_out_shape[i] + for i in range(extra_out_aval.ndim)]), ( + "Extra output buffer has incorrect shape: " + + f"expected {expected_extra_out_shape=} but found {extra_out_aval.shape=}" + ) + + # Validate bias/bias_grad shape against output bufer bias_dtype = jnp.bfloat16 if jax_dtype_is_fp8(out_dtype) else out_dtype if fuse_bias: assert ( bias_aval.size > 0 and bias_aval.ndim == 1 and bias_aval.shape[0] == out_shape[-1] - ), "Incorrect bias shape." + ), ( + "Incorrect bias shape: " + + f"expected ({out_shape[-1]}, ) but found ({bias_aval.shape[0]}, )" + ) bias_dtype = dtypes.canonicalize_dtype(bias_aval.dtype) else: assert bias_aval.size == 0, "Internal TE error." @@ -230,45 +280,8 @@ def abstract( else: assert gelu_input_aval.size == 0, "Internal TE error." - # Adjust output sizes for comm-overlap - extra_out_shape = (0,) - extra_out_dtype = jnp.bfloat16 - if comm_overlap_config is not None: - comm_overlap_type = comm_overlap_config.get("comm_type", None) - assert comm_overlap_type is not None, "Missing comm type for comm+GEMM overlap." - comm_overlap_name = comm_overlap_config.get("name", None) - assert ( - comm_overlap_name in _COMM_GEMM_OVERLAP_NAMES - ), f"Unrecognized comm+GEMM overlap name: {comm_overlap_name=}" - - mesh = comm_overlap_config.get("mesh", None) - tp_resource = comm_overlap_config.get("tp_resource", global_mesh_resource().tp_resource) - tp_size = get_mesh_axis_size(tp_resource, mesh=mesh) - - match comm_overlap_type: - case tex.CommOverlapType.AG: - # Extra output is all-gathered LHS copy - extra_out_shape = list(lhs_aval.shape).copy() - extra_out_shape[lhs_outer_dim] *= tp_size - extra_out_dtype = lhs_dtype - - case tex.CommOverlapType.RS: - # FP8 GEMM output for RS overlap is always FP8 - if jax_dtype_is_fp8(lhs_dtype): - assert jax_dtype_is_fp8( - out_dtype - ), "FP8 GEMM with reduce-scatter overlap requires FP8 output." 
- # Extra output is reduce-scattered GEMM output - extra_out_shape = list(out_shape).copy() - extra_out_shape[-2] /= tp_size - - case _: - raise RuntimeError( - f"Unrecognized comm type for comm+GEMM overlap: {comm_overlap_type=}" - ) - # Create abstract arrays for all outputs - out_aval = lhs_aval.update(shape=out_shape, dtype=out_dtype) + out_updated_aval = out_aval.update(shape=out_shape, dtype=out_dtype) out_amax_updated_aval = out_amax_aval.update( shape=out_amax_aval.shape, dtype=out_amax_updated_dtype ) @@ -277,18 +290,18 @@ def abstract( ) pre_gelu_out_aval = gelu_input_aval.update(shape=gelu_shape, dtype=bias_dtype) bias_grad_aval = bias_aval.update(shape=bias_aval.shape, dtype=bias_dtype) - extra_out_aval = jax.core.ShapedArray(shape=extra_out_shape, dtype=extra_out_dtype) + extra_out_updated_aval = extra_out_aval.update(shape=extra_out_shape, dtype=extra_out_dtype) workspace_aval = jax.core.ShapedArray( shape=(get_cublas_workspace_size_bytes(),), dtype=jnp.uint8 ) return ( - out_aval, + out_updated_aval, out_amax_updated_aval, out_scale_updated_aval, pre_gelu_out_aval, bias_grad_aval, - extra_out_aval, # global LHS for AG overlap, or sharded output for RS overlap + extra_out_updated_aval, # global LHS for AG overlap, or sharded output for RS overlap workspace_aval, ) @@ -324,10 +337,11 @@ def lowering( rhs_scale_inv, bias, gelu_input, + out, out_amax, out_scale, + extra_out, *, - out_dtype, batched_output, contracting_dims, fuse_gelu, @@ -336,11 +350,12 @@ def lowering( accumulate, use_split_accumulator, comm_overlap_config, + sharded_abstract ): """ Fused attention fwd lowering rules """ - del batched_output + del batched_output, sharded_abstract lhs_aval, _, rhs_aval, _, bias_aval, *_ = ctx.avals_in lhs_inner_dim, rhs_inner_dim = map( sanitize_dims, contracting_dims, (lhs_aval.ndim, rhs_aval.ndim) @@ -348,26 +363,31 @@ def lowering( lhs_trans = lhs_inner_dim != lhs_aval.ndim - 1 rhs_trans = rhs_inner_dim == rhs_aval.ndim - 1 + operands = [ + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out, + out_amax, + out_scale, + extra_out, + ] + operand_output_aliases = { 4: 4, # bias <--> bias_grad 5: 3, # gelu_input <--> pre_gelu_out - 6: 1, # out_amax <--> out_amax_updated - 7: 2, # out_scale <--> out_scale_updated + 6: 0, # out <--> out_updated + 7: 1, # out_amax <--> out_amax_updated + 8: 2, # out_scale <--> out_scale_updated + 9: 5, # extra_out <--> extra_out_updated } if is_ffi_enabled(): name = "te_gemm_ffi" - ffi_args = ( - ctx, - lhs, - lhs_scale_inv, - rhs, - rhs_scale_inv, - bias, - gelu_input, - out_amax, - out_scale, - ) + ffi_args = (ctx, *operands) ffi_kwargs = dict( lhs_trans=lhs_trans, rhs_trans=rhs_trans, @@ -380,23 +400,14 @@ def lowering( if comm_overlap_config is not None: name = "te_comm_gemm_overlap_ffi" - ffi_kwargs["comm_type"] = int(comm_overlap_config["comm_type"]) + ffi_kwargs["comm_type_flag"] = int(comm_overlap_config["comm_type"]) ffi_kwargs["name"] = comm_overlap_config["name"] return ffi.ffi_lowering(name, operand_output_aliases=operand_output_aliases)( *ffi_args, **ffi_kwargs ) + else: - operands = [ - lhs, - lhs_scale_inv, - rhs, - rhs_scale_inv, - bias, - gelu_input, - out_amax, - out_scale, - ] operand_shapes = map(lambda x: ir.RankedTensorType(x.type).shape, operands) out_types = [ ir.RankedTensorType.get(output.shape, mlir.dtype_to_ir_dtype(output.dtype)) @@ -423,7 +434,7 @@ def lowering( k, workspace_size, operand_dtype, - jax_dtype_to_te_dtype(out_dtype), + 
jax_dtype_to_te_dtype(dtypes.canonicalize_dtype(ctx.avals_out[0].dtype)), bias_dtype, lhs_trans, rhs_trans, @@ -434,14 +445,6 @@ def lowering( use_split_accumulator, ) - comm_overlap_type = comm_overlap_config.get("comm_type", None) - if comm_overlap_type is not None: - name = "te_comm_gemm_overlap" - descriptor_packer_fn = tex.pack_overlap_descriptor - descriptor_args += ( - comm_overlap_type, - comm_overlap_config.get("name", None), - ) opaque = descriptor_packer_fn(*descriptor_args) return custom_caller( @@ -460,9 +463,10 @@ def impl( rhs_scale_inv, bias, gelu_input, + out, out_amax, out_scale, - out_dtype, + extra_out, batched_output, contracting_dims, fuse_gelu, @@ -471,6 +475,7 @@ def impl( accumulate, use_split_accumulator, comm_overlap_config, + sharded_abstract, ): assert CollectiveGemmPrimitive.inner_primitive is not None @@ -526,14 +531,33 @@ def impl( else: contracting_dims_2d[1] = contracting_dims[1] + # Reshape output and extra output buffers into 2D as well + if out.ndim > 2: + out = jax.lax.reshape(out, (reduce(operator.mul, out.shape[:-1], 1), out.shape[-1])) + if extra_out.size > 0 and extra_out.ndim > 2: + extra_out = jax.lax.reshape( + extra_out, (reduce(operator.mul, extra_out.shape[:-1], 1), extra_out.shape[-1]) + ) + + batched_extra_out = False + if comm_overlap_config is not None and comm_overlap_config["method"] != "bulk": + comm_type = comm_overlap_config["comm_type"] + if comm_type == tex.CommOverlapType.AG: + # Extra output is global LHS, we can collapse but need to recover batches later + batched_extra_out = len(lhs_batch_dims) > 0 + elif comm_type == tex.CommOverlapType.RS: + # Extra output is scattered GEMM output, so we recover batches only if the output is + # batched + batched_extra_out = batched_output + # Invoke GEMM with guaranteed 2D inputs, so batched_output=False ( - out, + out_updated, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad, - extra_out, + extra_out_updated, _, ) = CollectiveGemmPrimitive.inner_primitive.bind( lhs, @@ -542,9 +566,10 @@ def impl( rhs_scale_inv, bias, gelu_input, + out, out_amax, out_scale, - out_dtype=out_dtype, + extra_out, batched_output=False, contracting_dims=contracting_dims_2d, fuse_gelu=fuse_gelu, @@ -553,21 +578,40 @@ def impl( accumulate=accumulate, use_split_accumulator=use_split_accumulator, comm_overlap_config=comm_overlap_config, + sharded_abstract=sharded_abstract, ) # Recover batched dimensions in the output if batched_output: - out_shape = (*lhs_batch_shape, out.shape[-2] // lhs_batch_size, out.shape[-1]) - out = jax.lax.reshape(out, out_shape) + out_shape = ( + *lhs_batch_shape, + out_updated.shape[-2] // lhs_batch_size, + out_updated.shape[-1] + ) + out_updated = jax.lax.reshape(out_updated, out_shape) - return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad, extra_out + if batched_extra_out: + extra_out_shape = ( + *lhs_batch_shape, + extra_out_updated.shape[-2] // lhs_batch_size, + extra_out_updated.shape[-1] + ) + extra_out_updated = jax.lax.reshape(extra_out_updated, extra_out_shape) + + return ( + out_updated, + out_amax_updated, + out_scale_updated, + pre_gelu_out, + bias_grad, + extra_out_updated, + ) @staticmethod def batcher( batched_args, batch_dims, *, - out_dtype, batched_output, contracting_dims, fuse_gelu, @@ -576,15 +620,23 @@ def batcher( accumulate, use_split_accumulator, comm_overlap_config, + sharded_abstract, ): assert CollectiveGemmPrimitive.outer_primitive is not None check_valid_batch_dims(batch_dims) - lhs_bdims, *_, bias_bdims, gelu_input_bdims, 
out_amax_bdims, out_scale_bdims = batch_dims + ( + *_, + bias_bdims, + gelu_input_bdims, + out_bdims, + out_amax_bdims, + out_scale_bdims, + extra_out_bdims, + ) = batch_dims return ( CollectiveGemmPrimitive.outer_primitive.bind( *batched_args, - out_dtype=out_dtype, batched_output=batched_output, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, @@ -592,13 +644,21 @@ def batcher( grad=grad, accumulate=accumulate, use_split_accumulator=use_split_accumulator, + comm_overlap_config=comm_overlap_config, + sharded_abstract=sharded_abstract, + ), + ( + out_bdims, + out_amax_bdims, + out_scale_bdims, + gelu_input_bdims, + bias_bdims, + extra_out_bdims ), - (lhs_bdims, out_amax_bdims, out_scale_bdims, gelu_input_bdims, bias_bdims), ) @staticmethod def infer_sharding_from_operands( - out_dtype, batched_output, contracting_dims, fuse_gelu, @@ -607,11 +667,12 @@ def infer_sharding_from_operands( accumulate, use_split_accumulator, comm_overlap_config, + sharded_abstract, mesh, arg_infos, result_infos, ): - del out_dtype, accumulate, use_split_accumulator, result_infos + del accumulate, use_split_accumulator, sharded_abstract, result_infos lhs, _, rhs, *_ = arg_infos lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) @@ -625,7 +686,6 @@ def infer_sharding_from_operands( # Modify operand specs lhs_spec_new = [spec for spec in lhs_spec] rhs_spec_new = [spec for spec in rhs_spec] - reduce_output = False if comm_overlap_config is None: # When comm overlap is not enabled: # - Always all-gather the outer dimension of LHS. @@ -651,7 +711,6 @@ def infer_sharding_from_operands( + "communication overhead." ) rhs_spec_new[rhs_outer_dim] = None - reduce_output = True else: if lhs_spec_new[lhs_inner_dim] is None and rhs_spec_new[rhs_inner_dim] is not None: warnings.warn( @@ -669,12 +728,6 @@ def infer_sharding_from_operands( ) lhs_spec_new[lhs_inner_dim] = None rhs_spec_new[rhs_inner_dim] = None - else: - # When comm overlap is enabled, make sure both contracting dims are unsharded if one - # of them is unsharded. - if lhs_spec_new[lhs_inner_dim] is None or rhs_spec_new[rhs_inner_dim] is None: - lhs_spec_new[lhs_inner_dim] = None - rhs_spec_new[rhs_inner_dim] = None out_col_spec = rhs_spec_new[rhs_outer_dim] # Output sharding is conditional on output shape @@ -698,37 +751,48 @@ def infer_sharding_from_operands( # Validate operand sharding for comm+GEMM overlap and adust extra output sharding extra_out_spec = [None] if comm_overlap_config is not None: - mesh = comm_overlap_config.get("mesh", None) + comm_type = comm_overlap_config.get("comm_type", None) tp_resource = comm_overlap_config.get("tp_resource", global_mesh_resource().tp_resource) - match comm_overlap_config.get("comm_type", None): - case tex.CommOverlapType.AG: - # AG overlap requires the outer dimension of LHS to be sharded - # over the TP resource - assert lhs_spec[lhs_outer_dim] == tp_resource, ( - "AG+GEMM overlap requires the outer (sequence) dimension of the LHS " - + f"operand to be sharded over the TP resource (mesh axis: {tp_resource=})." - ) - extra_out_spec = list(lhs_spec).copy() - extra_out_spec[lhs_outer_dim] = None - - case tex.CommOverlapType.RS: - # RS overlap requires the contracting dimensions of both LHS and RHS to be - # sharded over the TP resource, and the outer dimension of LHS to be unsharded - assert lhs_spec[lhs_outer_dim] is None, ( - "GEMM+RS overlap requires the outer (sequence) dimension of the LHS " - + "operand to be un-sharded." 
- ) - assert lhs_spec[lhs_inner_dim] == tp_resource, ( - "GEMM+RS overlap requires the contracting dimension of the LHS operand " - + f"to be sharded over the TP resource (mesh axis: {tp_resource=})." - ) - assert rhs_spec[rhs_inner_dim] == tp_resource, ( - "GEMM+RS overlap requires the contracting dimension of the RHS operand " - + f"to be sharded over the TP resource (mesh axis: {tp_resource=})." - ) - extra_out_spec = out_spec.copy() - extra_out_spec[-2] = tp_resource - + if comm_type == tex.CommOverlapType.AG: + # AG overlap requires the outer dimension of LHS to be sharded + # over the TP resource + assert lhs_spec[lhs_outer_dim] == tp_resource, ( + "AG+GEMM overlap requires the outer (sequence) dimension of the LHS " + + f"operand to be sharded over the TP resource '{tp_resource=}'." + ) + assert lhs_spec[lhs_inner_dim] is None, ( + "AG+GEMM overlap requires the contracting dimension of the LHS operand " + + "to be unsharded." + ) + assert rhs_spec[rhs_inner_dim] is None, ( + "AG+GEMM overlap requires the contracting dimension of the RHS operand " + + "to be unsharded." + ) + extra_out_spec = list(lhs_spec).copy() + extra_out_spec[lhs_outer_dim] = None + + elif comm_type == tex.CommOverlapType.RS: + # RS overlap requires the contracting dimensions of both LHS and RHS to be + # sharded over the TP resource, and the outer dimensions of LHS and RHS to be + # unsharded. + assert lhs_spec[lhs_outer_dim] is None, ( + "GEMM+RS overlap requires the outer (sequence) dimension of the LHS " + + "operand to be unsharded." + ) + assert lhs_spec[lhs_inner_dim] == tp_resource, ( + "GEMM+RS overlap requires the contracting dimension of the LHS operand " + + f"to be sharded over the TP resource '{tp_resource=}'." + ) + assert rhs_spec[rhs_inner_dim] == tp_resource, ( + "GEMM+RS overlap requires the contracting dimension of the RHS operand " + + f"to be sharded over the TP resource '{tp_resource=}'." + ) + assert rhs_spec[rhs_outer_dim] is None, ( + "GEMM+RS overlap requires the outer dimension of the RHS operand to be " + + "unsharded." + ) + extra_out_spec = list(out_spec).copy() + extra_out_spec[-2] = tp_resource extra_out_sharding = NamedSharding(mesh, PartitionSpec(*extra_out_spec)) return ( @@ -742,7 +806,6 @@ def infer_sharding_from_operands( @staticmethod def partition( - out_dtype, batched_output, contracting_dims, fuse_gelu, @@ -751,11 +814,12 @@ def partition( accumulate, use_split_accumulator, comm_overlap_config, + sharded_abstract, mesh, arg_infos, result_infos, ): - del result_infos + del sharded_abstract, result_infos lhs, _, rhs, *_ = arg_infos lhs_spec, rhs_spec = map(get_padded_spec, [lhs, rhs]) @@ -783,12 +847,6 @@ def partition( else: lhs_spec_new[lhs_inner_dim] = None rhs_spec_new[rhs_inner_dim] = None - else: - # When comm overlap is enabled, make sure both contracting dims are unsharded if one - # of them is unsharded. 
- if lhs_spec_new[lhs_inner_dim] is None or rhs_spec_new[rhs_inner_dim] is None: - lhs_spec_new[lhs_inner_dim] = None - rhs_spec_new[rhs_inner_dim] = None out_col_spec = rhs_spec_new[rhs_outer_dim] lhs_sharding = NamedSharding(mesh, PartitionSpec(*lhs_spec_new)) @@ -812,20 +870,17 @@ def partition( gelu_spec = [None, out_col_spec] if fuse_gelu else [None] gelu_sharding = NamedSharding(mesh, PartitionSpec(*gelu_spec)) - # Adjust extra output sharding for comm+GEMM overlap + # Extra output sharding for comm+GEMM overlap extra_out_spec = [None] if comm_overlap_config is not None: - mesh = comm_overlap_config.get("mesh", None) - tp_resource = comm_overlap_config.get("tp_resource", global_mesh_resource().tp_resource) - match comm_overlap_config.get("comm_type", None): - case tex.CommOverlapType.AG: - extra_out_spec = list(lhs_spec).copy() - extra_out_spec[lhs_outer_dim] = None - - case tex.CommOverlapType.RS: - extra_out_spec = out_spec.copy() - extra_out_spec[-2] = tp_resource - + comm_type = comm_overlap_config.get("comm_type", None) + if comm_type == tex.CommOverlapType.AG: + extra_out_spec = list(lhs_spec).copy() + extra_out_spec[lhs_outer_dim] = None + elif comm_type == tex.CommOverlapType.RS: + extra_out_spec = list(out_spec).copy() + extra_out_spec[-2] = comm_overlap_config.get("tp_resource", + global_mesh_resource().tp_resource) extra_out_sharding = NamedSharding(mesh, PartitionSpec(*extra_out_spec)) arg_shardings = ( @@ -835,8 +890,10 @@ def partition( fp8_meta_sharding, bias_sharding, gelu_sharding, + out_sharding, fp8_meta_sharding, fp8_meta_sharding, + extra_out_sharding, ) out_shardings = ( out_sharding, @@ -848,15 +905,16 @@ def partition( ) def sharded_impl( - lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out_amax, out_scale + lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out, out_amax, out_scale, + extra_out, ): ( - out, + out_updated, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad, - extra_out, + extra_out_updated, ) = CollectiveGemmPrimitive.impl( lhs, lhs_scale_inv, @@ -864,9 +922,10 @@ def sharded_impl( rhs_scale_inv, bias, gelu_input, + out, out_amax, out_scale, - out_dtype=out_dtype, + extra_out, batched_output=batched_output, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, @@ -875,6 +934,7 @@ def sharded_impl( accumulate=accumulate, use_split_accumulator=use_split_accumulator, comm_overlap_config=comm_overlap_config, + sharded_abstract=True, ) # FP8 amax reduction @@ -882,15 +942,19 @@ def sharded_impl( out_amax_updated = all_reduce_max_along_all_axes_except_PP(out_amax_updated, mesh) # All-reduce sum GEMM output when contracting dimensions are sharded - if comm_overlap_config is None: - if reduce_output: - out = jax.lax.psum(out, global_mesh_resource().tp_resource) - if fuse_gelu: - pre_gelu_out = jax.lax.psum( - pre_gelu_out, global_mesh_resource().tp_resource - ) + if comm_overlap_config is None and reduce_output: + out_updated = jax.lax.psum(out_updated, global_mesh_resource().tp_resource) + if fuse_gelu: + pre_gelu_out = jax.lax.psum(pre_gelu_out, global_mesh_resource().tp_resource) - return out, out_amax_updated, out_scale_updated, pre_gelu_out, bias_grad, extra_out + return ( + out_updated, + out_amax_updated, + out_scale_updated, + pre_gelu_out, + bias_grad, + extra_out_updated + ) return mesh, sharded_impl, out_shardings, arg_shardings @@ -903,6 +967,8 @@ def gemm_impl( rhs: ArrayLike, bias: Optional[ArrayLike] = None, gelu_input: Optional[ArrayLike] = None, + out: Optional[ArrayLike] = None, + extra_out: 
Optional[ArrayLike] = None, batched_output: bool = False, contracting_dims: Tuple[int, int] = (-1, -2), fuse_gelu: bool = False, @@ -917,7 +983,24 @@ def gemm_impl( lhs_inner_dim, rhs_inner_dim = map(sanitize_dims, contracting_dims, (lhs.ndim, rhs.ndim)) lhs_outer_dim = lhs.ndim - 1 if lhs_inner_dim != lhs.ndim - 1 else lhs.ndim - 2 rhs_outer_dim = rhs.ndim - 2 if rhs_inner_dim == rhs.ndim - 1 else rhs.ndim - 1 - out_shape = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + + out_shape_batched = (*lhs.shape[:-2], lhs.shape[lhs_outer_dim], rhs.shape[rhs_outer_dim]) + out_shape_2d = (reduce(operator.mul, out_shape_batched[:-1], 1), out_shape_batched[-1]) + out_shape = out_shape_batched if batched_output else out_shape_2d + + if out is None: + out = jnp.zeros(out_shape, dtype=lhs.dtype) + + if extra_out is None: + extra_out_shape = 0 + if (comm_overlap_config is not None + and comm_overlap_config["method"] != "bulk"): + comm_type = comm_overlap_config["comm_type"] + if comm_type == tex.CommOverlapType.AG: + extra_out_shape = list(lhs.shape).copy() + elif comm_type == tex.CommOverlapType.RS: + extra_out_shape = list(out_shape).copy() + extra_out = jnp.zeros(extra_out_shape, dtype=lhs.dtype) if not fuse_bias: bias = jnp.zeros(0, dtype=lhs.dtype) @@ -929,11 +1012,11 @@ def gemm_impl( if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=lhs.dtype) elif grad: - assert ( - gelu_input is not None - ), "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." + assert gelu_input is not None, ( + "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." + ) elif gelu_input is None: - gelu_input = jnp.zeros(out_shape, dtype=lhs.dtypes) + gelu_input = jnp.zeros(out_shape_2d, dtype=lhs.dtype) ( out, @@ -949,9 +1032,10 @@ def gemm_impl( dummy_fp8_meta, bias, gelu_input, + out, dummy_fp8_meta, dummy_fp8_meta, - out_dtype=lhs.dtype, + extra_out, batched_output=batched_output, contracting_dims=contracting_dims, fuse_gelu=fuse_gelu, @@ -960,6 +1044,7 @@ def gemm_impl( accumulate=accumulate, use_split_accumulator=use_split_accumulator, comm_overlap_config=comm_overlap_config, + sharded_abstract=False, ) if grad: @@ -975,6 +1060,7 @@ def fp8_gemm_impl( rhs_scale_inv: ArrayLike, bias: Optional[ArrayLike] = None, gelu_input: Optional[ArrayLike] = None, + out: Optional[ArrayLike] = None, out_amax: Optional[ArrayLike] = None, out_scale: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, @@ -986,7 +1072,27 @@ def fp8_gemm_impl( comm_overlap_config: Optional[dict] = None, ) -> Tuple[ArrayLike, ...]: """FP8 mat-mul with `nvte_cublas_gemm()` custom op.""" - if out_dtype is not None and jax_dtype_is_fp8(out_dtype): + out_shape_batched = (*lhs.shape[:-2], lhs.shape[-1], rhs_t.shape[-1]) + out_shape_2d = (reduce(operator.mul, out_shape_batched[:-1], 1), out_shape_batched[-1]) + out_shape = out_shape_batched if batched_output else out_shape_2d + + if out is None: + out = jnp.zeros(out_shape, dtype=out_dtype) + else: + out_dtype = out.dtype + + if extra_out is None: + extra_out_shape = 0 + if (comm_overlap_config is not None + and comm_overlap_config["method"] != "bulk"): + comm_type = comm_overlap_config["comm_type"] + if comm_type == tex.CommOverlapType.AG: + extra_out_shape = list(lhs.shape).copy() + elif comm_type == tex.CommOverlapType.RS: + extra_out_shape = list(out_shape).copy() + extra_out = jnp.zeros(extra_out_shape, dtype=jnp.bfloat16) + + if jax_dtype_is_fp8(out_dtype): assert out_amax is not None and out_scale is not None, "Missing 
output amax and scale." else: out_amax = jnp.zeros(0, dtype=jnp.float32) @@ -1000,8 +1106,7 @@ def fp8_gemm_impl( if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=bias.dtype) elif gelu_input is None: - gelu_shape = (reduce(operator.mul, lhs.shape[:-1]), rhs_t.shape[-1]) - gelu_input = jnp.zeros(gelu_shape, dtype=bias.dtype) + gelu_input = jnp.zeros(out_shape_2d, dtype=bias.dtype) (out, out_amax, out_scale, pre_gelu_out, _, extra_out) = ( # bias_grad in non-FP8 GEMM CollectiveGemmPrimitive.outer_primitive.bind( @@ -1011,9 +1116,10 @@ def fp8_gemm_impl( lhs_scale_inv, bias, gelu_input, + out, out_amax, out_scale, - out_dtype=out_dtype, + extra_out, batched_output=batched_output, contracting_dims=(-1, -1), fuse_gelu=fuse_gelu, @@ -1022,86 +1128,190 @@ def fp8_gemm_impl( accumulate=accumulate, use_split_accumulator=use_split_accumulator, comm_overlap_config=comm_overlap_config, + sharded_abstract=False, ) ) return out, out_amax, out_scale, pre_gelu_out, extra_out +class BootstrapCommGemmOverlapPrimitive(BasePrimitive): + """ + Initialize Comm+GEMM overlap communicators and buffers + """ + + name = "te_bootstrap_comm_gemm_overlap_ffi" + impl_static_args = (1,) + multiple_results = False + inner_primitive = None + outer_primitive = None + + @staticmethod + def abstract(buffer_aval, myrank, numranks, comm_overlap_config): + del myrank, numranks + assert is_ffi_enabled(), "Comm+GEMM overlap is supported only via XLA FFI." + overlap_name = comm_overlap_config.get("name", None) + assert overlap_name in _COMM_GEMM_OVERLAP_NAMES, ( + f"Unrecognized comm+GEMM overlap name: {overlap_name=}" + ) + assert buffer_aval.size > 0, "Cannot initialize a zero-size communication buffer." + return jax.core.ShapedArray(shape=(0,), dtype=dtypes.canonicalize_dtype(buffer_aval.dtype)) + + @staticmethod + def lowering(ctx, buffer, *, myrank, numranks, comm_overlap_config): + return ffi.ffi_lowering(BootstrapCommGemmOverlapPrimitive.name)( + ctx, + buffer, + name=comm_overlap_config["name"], + method=comm_overlap_config["method"], + myrank=myrank, + numranks=numranks, + tp_size=comm_overlap_config["tp_size"], + num_splits=comm_overlap_config["num_splits"], + num_max_streams=comm_overlap_config["num_max_streams"], + cga_size=comm_overlap_config["cga_size"], + num_comm_sm=comm_overlap_config["num_sm"], + set_sm_margin=comm_overlap_config["set_sm_margin"], + use_ce=comm_overlap_config["use_ce"], + atomic_gemm=comm_overlap_config["atomic_gemm"], + aggregate=comm_overlap_config["aggregate"], + pipeline_rs_overlap_first_gemm=comm_overlap_config["pipeline_rs_overlap_first_gemm"], + ) + + @staticmethod + def impl(buffer, myrank, numranks, comm_overlap_config): + assert BootstrapCommGemmOverlapPrimitive.inner_primitive is not None + buffer = jax.lax.reshape( + buffer, (reduce(operator.mul, buffer.shape[:-1], 1), buffer.shape[-1]) + ) + return BootstrapCommGemmOverlapPrimitive.inner_primitive.bind( + buffer, myrank=myrank, numranks=numranks, comm_overlap_config=comm_overlap_config, + ) + + @staticmethod + def batcher(batched_args, batch_dims, *, myrank, numranks, comm_overlap_config): + assert BootstrapCommGemmOverlapPrimitive.inner_primitive is not None + check_valid_batch_dims(batch_dims) + return ( + BootstrapCommGemmOverlapPrimitive.inner_primitive.bind( + *batched_args, myrank=myrank, numranks=numranks, comm_overlap_config=comm_overlap_config + ), + None, + ) + + @staticmethod + def infer_sharding_from_operands(myrank, numranks, comm_overlap_config, mesh, arg_infos, + result_infos): + del myrank, numranks, 
comm_overlap_config, result_infos + buffer_spec = get_padded_spec(arg_infos[0]) + assert all([spec is None for spec in buffer_spec]), "Sample buffer must be unsharded." + return NamedSharding(mesh, PartitionSpec(None)) + + @staticmethod + def partition(myrank, numranks, comm_overlap_config, mesh, arg_infos, result_infos): + del arg_infos, result_infos + arg_shardings = (NamedSharding(mesh, PartitionSpec(None)),) + out_sharding = NamedSharding(mesh, PartitionSpec(None)) + return ( + mesh, + partial(BootstrapCommGemmOverlapPrimitive.impl, + myrank=myrank, + numranks=numranks, + comm_overlap_config=comm_overlap_config), + out_sharding, + arg_shardings, + ) + + +register_primitive(BootstrapCommGemmOverlapPrimitive) + + +def bootstrap_comm_gemm_overlap( + buffer: ArrayLike, + myrank: int, + numranks: int, + comm_overlap_config: dict +): + _ = BootstrapCommGemmOverlapPrimitive.outer_primitive.bind( + buffer, + myrank=myrank, + numranks=numranks, + comm_overlap_config=comm_overlap_config + ) + + class CopyIntoOverlapBufferPrimitive(BasePrimitive): """ Copy JAX array data into comm+GEMM overlap buffer """ - name = "te_copy_into_overlap_buffer" + name = "te_copy_into_overlap_buffer_ffi" impl_static_args = (1, 2) multiple_results = False inner_primitive = None outer_primitive = None @staticmethod - def abstract(inp_aval, name, comm_type): + def abstract(inp_aval, name, sharded): + del sharded + assert is_ffi_enabled(), "Comm+GEMM overlap is supported only via XLA FFI." assert name in _COMM_GEMM_OVERLAP_NAMES, f"Unrecognized comm+GEMM overlap name: {name=}" - assert comm_type in [ - tex.CommOverlapType.AG, - tex.CommOverlapType.RS, - ], "Invalid comm+GEMM overlap type." assert inp_aval.size > 0, "Cannot copy a zero-size array into overlap buffer." - assert inp_aval.ndim == 2, "Cannot copy more than 2 dimensions!" 
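+        # The copy itself is a device-side side effect into the persistent communication
+        # buffer registered under `name`, so this primitive only exposes a zero-size dummy
+        # output (carrying the input dtype) to satisfy JAX abstract evaluation.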
return jax.core.ShapedArray(shape=(0,), dtype=dtypes.canonicalize_dtype(inp_aval.dtype)) @staticmethod - def lowering(ctx, inp, *, name, comm_type): - if is_ffi_enabled(): - name = "te_copy_into_overlap_buffer_ffi" - return ffi.ffi_lowering(name)( - ctx, - inp, - name=name, - comm_type=int(comm_type), - ) - else: - operands = [inp] - operand_shapes = [ir.RankedTensorType(inp.type).shape] - out_types = [] - args = CustomCallArgsWrapper(out_types, operands, operand_shapes) - opaque = tex.pack_buffer_descriptor( - name, inp.shape, jax_dtype_to_te_dtype(inp.dtype), comm_type - ) - return custom_caller(CopyIntoOverlapBufferPrimitive.name, args, opaque, False) + def lowering(ctx, inp, *, name, sharded): + return ffi.ffi_lowering(name)( + ctx, + inp, + name=name, + sharded=sharded, + ) @staticmethod - def impl(inp, name, comm_type): + def impl(inp, name, sharded): assert CopyIntoOverlapBufferPrimitive.inner_primitive is not None + inp_2d = jax.lax.reshape(inp, (reduce(operator.mul, inp.shape[:-1], 1), inp.shape[-1])) return CopyIntoOverlapBufferPrimitive.inner_primitive.bind( - inp, name=name, comm_type=comm_type + inp_2d, name=name, sharded=sharded ) @staticmethod - def batcher(batched_args, batch_dims, *, name, comm_type): + def batcher(batched_args, batch_dims, *, name, sharded): assert CopyIntoOverlapBufferPrimitive.inner_primitive is not None check_valid_batch_dims(batch_dims) return ( CopyIntoOverlapBufferPrimitive.inner_primitive.bind( - *batched_args, name=name, comm_type=comm_type + *batched_args, name=name, sharded=sharded ), None, ) @staticmethod - def infer_sharding_from_operands(name, comm_type, mesh, arg_infos, result_infos): - del name, comm_type, arg_infos, result_infos + def infer_sharding_from_operands(name, sharded, mesh, arg_infos, result_infos): + del name, result_infos + inp_spec = get_padded_spec(arg_infos[0]) + if sharded: + assert inp_spec[-2] is not None, ( + "Leading dimension of input tensor must be sharded in order to copy into a " + + "sharded communication tensor (e.g. preparing for bulk all-gather overlap)." + ) + else: + assert inp_spec[-2] is None, ( + "Leading dimension of input tensor cannot be sharded when copying into an " + + "unsharded communication tensor (e.g. preparing for bulk reduce-scatter overlap)." 
+ ) return NamedSharding(mesh, PartitionSpec(None)) @staticmethod - def partition(name, comm_type, mesh, arg_infos, result_infos): - del name, comm_type, result_infos - inp_spec = arg_infos[0] + def partition(name, sharded, mesh, arg_infos, result_infos): + del name, sharded, result_infos + inp_spec = get_padded_spec(arg_infos[0]) arg_shardings = (NamedSharding(mesh, PartitionSpec(*inp_spec)),) out_sharding = NamedSharding(mesh, PartitionSpec(None)) return ( mesh, - partial(CopyIntoOverlapBufferPrimitive.impl, name=name, comm_type=comm_type), + partial(CopyIntoOverlapBufferPrimitive.impl, name=name, sharded=sharded), out_sharding, arg_shardings, ) @@ -1110,5 +1320,5 @@ def partition(name, comm_type, mesh, arg_infos, result_infos): register_primitive(CopyIntoOverlapBufferPrimitive) -def copy_into_overlap_buffer(inp: ArrayLike, name: str, comm_type: tex.CommOverlapType) -> None: - _ = CollectiveGemmPrimitive.outer_primitive.bind(inp, name=name, comm_type=comm_type) +def copy_into_overlap_buffer(inp: ArrayLike, name: str, sharded: bool) -> None: + _ = CopyIntoOverlapBufferPrimitive.outer_primitive.bind(inp, name=name, sharded=sharded) diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index d123d9b5b4..fd0786a040 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -171,44 +171,6 @@ pybind11::bytes PackCustomCallGemmDescriptor(size_t m, size_t n, size_t k, size_ bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator); -struct CustomCallBufferDescriptor { - const std::string name; - const size_t *shape; - const size_t ndim; - DType dtype; - CommOverlapType comm_type; -}; - -pybind11::bytes PackCustomCallBufferDescriptor(const std::string &name, - const std::vector &shape, DType dtype, - CommOverlapType comm_type); - -struct CustomCallOverlapDescriptor { - size_t m; - size_t k; - size_t n; - size_t workspace_size; - DType operand_dtype; - DType bias_dtype; - DType out_dtype; - bool lhs_trans; - bool rhs_trans; - bool fuse_gelu; - bool fuse_bias; - bool grad; - bool accumulate; - bool use_split_accumulator; - CommOverlapType comm_type; - const std::string name; -}; - -pybind11::bytes PackCustomCallOverlapDescriptor(size_t m, size_t k, size_t n, size_t workspace_size, - DType operand_dtype, DType bias_dtype, - DType out_dtype, bool lhs_trans, bool rhs_trans, - bool fuse_gelu, bool fuse_bias, bool grad, - bool accumulate, bool use_split_accumulator, - CommOverlapType comm_type, const std::string &name); - // Transpose void Transpose(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); @@ -372,54 +334,63 @@ XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnBackwardHandler); // GEMM +XLA_FFI_DECLARE_HANDLER_SYMBOL(CublasltHandleInitHandler); + void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); -Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, - Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, - Buffer_Type out_amax, Buffer_Type out_scale, Result_Type out, - Result_Type out_amax_updated, Result_Type out_scale_updated, - Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type dummy_out, - Result_Type workspace, bool lhs_trans, bool rhs_trans, bool fuse_gelu, - bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator); +Error_Type GemmFFI( + cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, 
Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, + Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type dummy_in, Result_Type out_updated, + Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, + Result_Type bias_grad, Result_Type dummy_out, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, + bool use_split_accumulator); XLA_FFI_DECLARE_HANDLER_SYMBOL(GemmHandler); // Comm+GEMM Overlap -void BootstrapCommGemmOverlap(const std::string &name, const std::string &method, - const std::vector &buffer_shape, DType buffer_dtype, - CommOverlapType comm_type, int tp_size, int num_splits, - int num_max_streams, int comm_cga_size, int num_comm_sm, - int set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, - bool pipeline_rs_overlap_first_gemm); +bool OverlapBufferIsFp8(const std::string &name); -void DestroyCommGemmOverlap(const std::string &name); +pybind11::object GetOverlapBuffer(const std::string &name, bool sharded); -void SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, - bool grad = false); +void SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, bool grad); -bool OverlapBufferIsFp8(const std::string &name); +void BootstrapCommGemmOverlap( + const std::vector &buffer_shape, DType buffer_dtype, const std::string &name, + const std::string &method, CommOverlapType comm_type, int64_t myrank, int64_t numranks, + int64_t tp_size, int64_t num_splits, int64_t num_max_streams, int64_t cga_size, + int64_t num_comm_sm, bool set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm); + +Error_Type BootstrapCommGemmOverlapFFI( + cudaStream_t, Buffer_Type sample_buffer, std::string_view name, std::string_view method, + int64_t comm_type_flag, int64_t myrank, int64_t numranks, int64_t tp_size, int64_t num_splits, + int64_t num_max_streams, int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, + bool use_ce, bool atomic_gemm, bool aggregate, bool pipeline_rs_overlap_first_gemm); + +XLA_FFI_DECLARE_HANDLER_SYMBOL(BootstrapCommGemmOverlapHandler); + +void DestroyCommGemmOverlap(const std::string &name); -pybind11::object GetOverlapBuffer(const std::string &name, CommOverlapType comm_type); +Error_Type DestroyCommGemmOverlapFFI(cudaStream_t stream, std::string_view name); -void CopyIntoOverlapBuffer(cudaStream_t, void **buffers, const char *opaque, size_t opaque_len); +XLA_FFI_DECLARE_HANDLER_SYMBOL(DestroyCommGemmOverlapHandler); Error_Type CopyIntoOverlapBufferFFI(cudaStream_t stream, Buffer_Type input, std::string_view name, - int32_t comm_type_flag); + bool sharded); XLA_FFI_DECLARE_HANDLER_SYMBOL(CopyIntoOverlapBufferHandler); -void CommGemmOverlap(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); - -Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, - Buffer_Type rhs, Buffer_Type rhs_scale_inv, Buffer_Type bias, - Buffer_Type gelu_input, Buffer_Type out_amax, Buffer_Type out_scale, - Result_Type out, Result_Type out_amax_new, Result_Type out_scale_new, - Result_Type pre_gelu_out, Result_Type bias_grad, - Result_Type extra_out, Result_Type workspace, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, - bool accumulate, bool use_split_accumulator, int32_t comm_type_flag, - std::string_view name); +Error_Type CommGemmOverlapFFI( + cudaStream_t stream, Buffer_Type lhs, Buffer_Type 
lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, + Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type extra_out, Result_Type out_updated, + Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, + Result_Type bias_grad, Result_Type extra_out_updated, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, + bool use_split_accumulator, int64_t comm_type_flag, std::string_view name); XLA_FFI_DECLARE_HANDLER_SYMBOL(CommGemmOverlapHandler); diff --git a/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp b/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp index df1f4bdc23..d6f5daaa80 100644 --- a/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp +++ b/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp @@ -16,14 +16,38 @@ namespace transformer_engine { namespace jax { +Error_Type CublasltHandleInitFFI(Variadic_Buffer_Type args, Variadic_Result_Type rets, + Dictionary attrs) { + cublasLtHandle_t handle; + NVTE_CHECK_CUBLAS(cublasLtCreate(&handle)); + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL(CublasltHandleInitHandler, CublasltHandleInitFFI, + FFI::Bind().RemainingArgs().RemainingRets().Attrs()); + static std::unordered_map _overlaps; -void BootstrapCommGemmOverlap(const std::string &name, const std::string &method, - const std::vector &buffer_shape, DType buffer_dtype, - CommOverlapType comm_type, int tp_size, int num_splits, - int num_max_streams, int comm_cga_size, int num_comm_sm, - int set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, - bool pipeline_rs_overlap_first_gemm) { +void SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, bool grad) { + auto scale_inv_tensor = DLPackWrapper(scale_inv, grad); + _overlaps[name]->set_ubuf_scale_inv(reinterpret_cast(scale_inv_tensor.dptr())); +} + +bool OverlapBufferIsFp8(const std::string &name) { return _overlaps[name]->is_fp8_ubuf(); } + +pybind11::object GetOverlapBuffer(const std::string &name, bool sharded) { + auto comm_type = (sharded) ? CommOverlapType::RS : CommOverlapType::AG; + DLPackWrapper output = std::move(_overlaps[name]->get_ubuf_output(comm_type)); + auto capsule = output.capsule(); + return capsule; +}; + +void BootstrapCommGemmOverlap( + const std::vector &buffer_shape, DType buffer_dtype, const std::string &name, + const std::string &method, CommOverlapType comm_type, int64_t myrank, int64_t numranks, + int64_t tp_size, int64_t num_splits, int64_t num_max_streams, int64_t comm_cga_size, + int64_t num_comm_sm, bool set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm) { #ifndef NVTE_UB_WITH_MPI NVTE_ERROR( std::string("Comm+GEMM overlap in TE/JAX requires bootstrapping Userbuffers with MPI. 
") + @@ -32,19 +56,56 @@ void BootstrapCommGemmOverlap(const std::string &name, const std::string &method // Initialize overlap object -- this allocates the comm buffer NVTE_CHECK(_overlaps.find(name) == _overlaps.end(), name, " is already initialized!"); - if (method == "ring-exchange") { - _overlaps[name] = reinterpret_cast(new CommOverlapP2PBase( - buffer_shape, buffer_dtype, -1, -1, -1, -1, -1, -1, tp_size, &_dummy_allgather, + if (method == "ring_exchange") { + _overlaps[name] = new CommOverlapP2PBase( + buffer_shape, buffer_dtype, myrank, numranks, -1, -1, -1, -1, tp_size, &_dummy_allgather, &_dummy_barrier, comm_type, num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, - use_ce, atomic_gemm, aggregate)); + use_ce, atomic_gemm, aggregate); } else { - _overlaps[name] = reinterpret_cast(new CommOverlapBase( - buffer_shape, buffer_dtype, -1, -1, -1, -1, -1, -1, tp_size, &_dummy_allgather, + _overlaps[name] = new CommOverlapBase( + buffer_shape, buffer_dtype, myrank, numranks, -1, -1, -1, -1, tp_size, &_dummy_allgather, &_dummy_barrier, num_splits, num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, - atomic_gemm, pipeline_rs_overlap_first_gemm)); + atomic_gemm, pipeline_rs_overlap_first_gemm); } }; +Error_Type BootstrapCommGemmOverlapFFI( + cudaStream_t, Buffer_Type sample_buffer, std::string_view name, std::string_view method, + int64_t comm_type_flag, int64_t myrank, int64_t numranks, int64_t tp_size, int64_t num_splits, + int64_t num_max_streams, int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, + bool use_ce, bool atomic_gemm, bool aggregate, bool pipeline_rs_overlap_first_gemm) { + auto buffer_shape = std::vector(sample_buffer.dimensions().begin(), + sample_buffer.dimensions().end()); + auto buffer_dtype = convert_ffi_datatype_to_te_dtype(sample_buffer.element_type()); + BootstrapCommGemmOverlap( + buffer_shape, buffer_dtype, static_cast(name), static_cast(method), + static_cast(comm_type_flag), myrank, numranks, tp_size, num_splits, + num_max_streams, cga_size, num_comm_sm, set_sm_margin, use_ce, atomic_gemm, aggregate, + pipeline_rs_overlap_first_gemm); + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL(BootstrapCommGemmOverlapHandler, BootstrapCommGemmOverlapFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // sample_buffer + .Attr("name") + .Attr("method") + .Attr("comm_type_flag") + .Attr("myrank") + .Attr("numranks") + .Attr("tp_size") + .Attr("num_splits") + .Attr("num_max_streams") + .Attr("cga_size") + .Attr("num_comm_sm") + .Attr("set_sm_margin") + .Attr("use_ce") + .Attr("atomic_gemm") + .Attr("aggregate") + .Attr("pipeline_rs_overlap_first_gemm"), + FFI_CudaGraph_Traits); + void DestroyCommGemmOverlap(const std::string &name) { auto overlap = _overlaps.find(name); if (overlap != _overlaps.end()) { @@ -53,45 +114,33 @@ void DestroyCommGemmOverlap(const std::string &name) { } }; -void SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, bool grad) { - auto scale_inv_tensor = DLPackWrapper(scale_inv, grad); - _overlaps[name]->set_ubuf_scale_inv(reinterpret_cast(scale_inv_tensor.dptr())); +Error_Type DestroyCommGemmOverlapFFI(cudaStream_t stream, std::string_view name) { + DestroyCommGemmOverlap(static_cast(name)); + return ffi_with_cuda_error_check(); } -bool OverlapBufferIsFp8(const std::string &name) { return _overlaps[name]->is_fp8_ubuf(); } - -pybind11::object GetOverlapBuffer(const std::string &name, CommOverlapType comm_type) { - DLPackWrapper output = 
std::move(_overlaps[name]->get_ubuf_output(comm_type)); - auto capsule = output.capsule(); - return capsule; -}; +XLA_FFI_DEFINE_HANDLER_SYMBOL(DestroyComMGemmOverlapHandler, DestroyCommGemmOverlapFFI, + FFI::Bind() + .Ctx() + .Attr("name"), + FFI_CudaGraph_Traits); void CopyIntoOverlapBufferImpl(cudaStream_t stream, void *input_ptr, const std::vector &shape, DType dtype, - const std::string &name, CommOverlapType comm_type) { + const std::string &name, bool sharded) { auto input = TensorWrapper(input_ptr, shape, dtype); + auto comm_type = (sharded) ? CommOverlapType::RS : CommOverlapType::AG; _overlaps[name]->copy_into_ubuf(stream, input, comm_type); } -void CopyIntoOverlapBuffer(cudaStream_t stream, void **buffers, const char *opaque, - size_t opaque_len) { - auto input_ptr = buffers[0]; - - const auto &desc = *UnpackOpaque(opaque, opaque_len); - - CopyIntoOverlapBufferImpl(stream, input_ptr, - std::vector(desc.shape, desc.shape + desc.ndim), desc.dtype, - desc.name, desc.comm_type); -} - Error_Type CopyIntoOverlapBufferFFI(cudaStream_t stream, Buffer_Type input, std::string_view name, - int32_t comm_type_flag) { + bool sharded) { auto input_ptr = input.untyped_data(); auto shape = std::vector(input.dimensions().begin(), input.dimensions().end()); auto dtype = convert_ffi_datatype_to_te_dtype(input.element_type()); CopyIntoOverlapBufferImpl(stream, input_ptr, shape, dtype, static_cast(name), - static_cast(comm_type_flag)); + sharded); return ffi_with_cuda_error_check(); } @@ -101,7 +150,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(CopyIntoOverlapBufferHandler, CopyIntoOverlapBuffe .Ctx() // stream .Arg() // input .Attr("name") - .Attr("comm_type_flag"), + .Attr("sharded"), FFI_CudaGraph_Traits); void CommGemmOverlapImpl(void *lhs, const std::vector &lhs_shape, DType lhs_dtype, @@ -156,59 +205,14 @@ void CommGemmOverlapImpl(void *lhs, const std::vector &lhs_shape, DType } } -void CommGemmOverlap(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len) { - // Inputs - auto lhs = buffers[0]; - auto lhs_scale_inv = reinterpret_cast(buffers[1]); - auto rhs = buffers[2]; - auto rhs_scale_inv = reinterpret_cast(buffers[3]); - auto bias = buffers[4]; - auto gelu_input = buffers[5]; - auto out_amax = reinterpret_cast(buffers[6]); - auto out_scale = reinterpret_cast(buffers[7]); - - // Outputs - auto out = buffers[8]; - auto out_amax_new = reinterpret_cast(buffers[9]); - auto out_scale_new = reinterpret_cast(buffers[10]); - auto pre_gelu_out = buffers[11]; - auto bias_grad = buffers[12]; - auto extra_out = buffers[13]; - auto workspace = buffers[14]; - - // Check operand-output aliases - NVTE_CHECK(bias == bias_grad, "bias not bound to bias_grad in AG+GEMM overlap."); - NVTE_CHECK(gelu_input == pre_gelu_out, - "gelu_input not bound to pre_gelu_out in AG+GEMM overlap."); - NVTE_CHECK(out_amax == out_amax_new, "out_amax not bound to out_amax_new in AG+GEMM overlap."); - NVTE_CHECK(out_scale == out_scale_new, - "out_scale not bound to out_scale_new in AG+GEMM overlap."); - - const auto &desc = *UnpackOpaque(opaque, opaque_len); - - auto lhs_shape = - (desc.lhs_trans) ? std::vector{desc.k, desc.m} : std::vector{desc.m, desc.k}; - auto rhs_shape = - (desc.rhs_trans) ? 
std::vector{desc.n, desc.k} : std::vector{desc.k, desc.n}; - auto out_shape = std::vector{desc.m, desc.n}; - - CommGemmOverlapImpl(lhs, lhs_shape, desc.operand_dtype, lhs_scale_inv, desc.lhs_trans, rhs, - rhs_shape, desc.operand_dtype, rhs_scale_inv, desc.rhs_trans, out, out_shape, - desc.out_dtype, out_amax, out_scale, bias, desc.bias_dtype, pre_gelu_out, - extra_out, lhs_shape, workspace, desc.workspace_size, desc.fuse_gelu, - desc.fuse_bias, desc.grad, desc.accumulate, desc.use_split_accumulator, - desc.comm_type, desc.name, stream); -} - -Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, - Buffer_Type rhs, Buffer_Type rhs_scale_inv, Buffer_Type bias, - Buffer_Type gelu_input, Buffer_Type out_amax, Buffer_Type out_scale, - Result_Type out, Result_Type out_amax_new, Result_Type out_scale_new, - Result_Type pre_gelu_out, Result_Type bias_grad, - Result_Type extra_out, Result_Type workspace, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, - bool accumulate, bool use_split_accumulator, int32_t comm_type_flag, - std::string_view name) { +Error_Type CommGemmOverlapFFI( + cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, + Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type extra_out, Result_Type out_updated, + Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, + Result_Type bias_grad, Result_Type extra_out_updated, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, + bool use_split_accumulator, int64_t comm_type_flag, std::string_view name) { // Inputs auto lhs_ptr = lhs.untyped_data(); auto lhs_shape = std::vector(lhs.dimensions().begin(), lhs.dimensions().end()); @@ -221,31 +225,38 @@ Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type auto bias_ptr = bias.untyped_data(); auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type()); auto gelu_input_ptr = gelu_input.untyped_data(); + auto out_ptr = out.untyped_data(); auto out_amax_ptr = reinterpret_cast(out_amax.untyped_data()); auto out_scale_ptr = reinterpret_cast(out_scale.untyped_data()); + auto extra_out_ptr = extra_out.untyped_data(); // Outputs - auto out_ptr = out->untyped_data(); - auto out_shape = std::vector(out->dimensions().begin(), out->dimensions().end()); - auto out_dtype = convert_ffi_datatype_to_te_dtype(out->element_type()); - auto out_amax_new_ptr = reinterpret_cast(out_amax_new->untyped_data()); - auto out_scale_new_ptr = reinterpret_cast(out_scale_new->untyped_data()); + auto out_updated_ptr = out_updated->untyped_data(); + auto out_shape = std::vector(out_updated->dimensions().begin(), + out_updated->dimensions().end()); + auto out_dtype = convert_ffi_datatype_to_te_dtype(out_updated->element_type()); + auto out_amax_updated_ptr = reinterpret_cast(out_amax_updated->untyped_data()); + auto out_scale_updated_ptr = reinterpret_cast(out_scale_updated->untyped_data()); auto pre_gelu_ptr = pre_gelu_out->untyped_data(); auto bias_grad_ptr = bias_grad->untyped_data(); - auto extra_out_ptr = extra_out->untyped_data(); - auto extra_out_shape = - std::vector(extra_out->dimensions().begin(), extra_out->dimensions().end()); + auto extra_out_updated_ptr = extra_out_updated->untyped_data(); + auto extra_out_shape = std::vector(extra_out_updated->dimensions().begin(), + 
extra_out_updated->dimensions().end()); auto workspace_ptr = workspace->untyped_data(); auto workspace_size = workspace->element_count(); // Check operand-output aliases - NVTE_CHECK(bias_ptr == bias_grad_ptr, "bias not bound to bias_grad in AG+GEMM overlap."); + NVTE_CHECK(bias_ptr == bias_grad_ptr, "bias not bound to bias_grad in TE/JAX comm+GEMM overlap."); NVTE_CHECK(gelu_input_ptr == pre_gelu_ptr, - "gelu_input not bound to pre_gelu_out in AG+GEMM overlap."); - NVTE_CHECK(out_amax_ptr == out_amax_new_ptr, - "out_amax not bound to out_amax_new in AG+GEMM overlap."); - NVTE_CHECK(out_scale_ptr == out_scale_new_ptr, - "out_scale not bound to out_scale_new in AG+GEMM overlap."); + "gelu_input not bound to pre_gelu_out in TE/JAX comm+GEMM overlap."); + NVTE_CHECK(out_ptr == out_updated_ptr, + "out not bound to out_updated in TE/JAX comm+GEMM overlap."); + NVTE_CHECK(out_amax_ptr == out_amax_updated_ptr, + "out_amax not bound to out_amax_updated in TE/JAX comm+GEMM overlap."); + NVTE_CHECK(out_scale_ptr == out_scale_updated_ptr, + "out_scale not bound to out_scale_updated in TE/JAX comm+GEMM overlap."); + NVTE_CHECK(extra_out_ptr == extra_out_updated_ptr, + "extra_out not bound to extra_out_updated in TE/JAX comm+GEMM overlap."); CommGemmOverlapImpl( lhs_ptr, lhs_shape, lhs_dtype, lhs_scale_inv_ptr, lhs_trans, rhs_ptr, rhs_shape, rhs_dtype, @@ -266,14 +277,16 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(CommGemmOverlapHandler, CommGemmOverlapFFI, .Arg() // rhs_scale_inv .Arg() // bias .Arg() // gelu_input + .Arg() // out .Arg() // out_amax .Arg() // out_scale - .Ret() // out - .Ret() // out_amax_new - .Ret() // out_scale_new + .Arg() // extra_out + .Ret() // out_updated + .Ret() // out_amax_updated + .Ret() // out_scale_updated .Ret() // pre_gelu_out .Ret() // bias_grad - .Ret() // extra_out + .Ret() // extra_out_updated .Ret() // workspace .Attr("lhs_trans") .Attr("rhs_trans") @@ -282,7 +295,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(CommGemmOverlapHandler, CommGemmOverlapFFI, .Attr("grad") .Attr("accumulate") .Attr("use_split_accumulator") - .Attr("comm_type_flag") + .Attr("comm_type_flag") .Attr("name"), FFI_CudaGraph_Traits); diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 14148ecbd0..8f6f907268 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -53,21 +53,24 @@ void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque auto *rhs_scale_inv = reinterpret_cast(buffers[3]); auto *bias = buffers[4]; auto *gelu_input = buffers[5]; - auto *out_amax = reinterpret_cast(buffers[6]); - auto *out_scale = reinterpret_cast(buffers[7]); + auto *out = buffers[6]; + auto *out_amax = reinterpret_cast(buffers[7]); + auto *out_scale = reinterpret_cast(buffers[8]); + // buffers[9] is the extra output bufer for comm+GEMM overlap, not used here // Outputs - auto *out = buffers[8]; - auto *out_amax_updated = reinterpret_cast(buffers[9]); - auto *out_scale_updated = reinterpret_cast(buffers[10]); - auto *pre_gelu_out = buffers[11]; - auto *bias_grad = buffers[12]; - // buffers[13] is the extra output for comm+GEMM overlap, not used here - auto *workspace = buffers[14]; + auto *out_updated = buffers[10]; + auto *out_amax_updated = reinterpret_cast(buffers[11]); + auto *out_scale_updated = reinterpret_cast(buffers[12]); + auto *pre_gelu_out = buffers[13]; + auto *bias_grad = buffers[14]; + // buffers[15] is the updated extra output for comm+GEMM overlap, not used here + auto 
*workspace = buffers[16]; // Operand aliasing NVTE_CHECK(bias == bias_grad, "bias not bound to bias_grad in TE/JAX GEMM"); NVTE_CHECK(gelu_input == pre_gelu_out, "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); + NVTE_CHECK(out == out_updated, "out not bound to out_updated in TE/JAX GEMM"); NVTE_CHECK(out_amax == out_amax_updated, "out_amax not bound to out_amax_updated in TE/JAX GEMM"); NVTE_CHECK(out_scale == out_scale_updated, "out_scale not bound to out_scale_updated in TE/JAX GEMM"); @@ -85,13 +88,15 @@ void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque desc.fuse_bias, desc.grad, desc.accumulate, desc.use_split_accumulator); } -Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, - Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, - Buffer_Type out_amax, Buffer_Type out_scale, Result_Type out, - Result_Type out_amax_updated, Result_Type out_scale_updated, - Result_Type pre_gelu_out, Result_Type bias_grad, Result_Type dummy_out, - Result_Type workspace, bool lhs_trans, bool rhs_trans, bool fuse_gelu, - bool fuse_bias, bool grad, bool accumulate, bool use_split_accumulator) { +Error_Type GemmFFI( + cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, + Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type dummy_in, Result_Type out_updated, + Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, + Result_Type bias_grad, Result_Type dummy_out, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, + bool use_split_accumulator +) { // Inputs auto lhs_ptr = lhs.untyped_data(); auto lhs_scale_inv_ptr = reinterpret_cast(lhs_scale_inv.untyped_data()); @@ -101,17 +106,19 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i auto bias_ptr = bias.untyped_data(); auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type()); auto gelu_input_ptr = gelu_input.untyped_data(); + auto out_ptr = out.untyped_data(); auto out_amax_ptr = reinterpret_cast(out_amax.untyped_data()); auto out_scale_ptr = reinterpret_cast(out_scale.untyped_data()); + // dummy_in is the extra output buffer for comm+GEMM overlap, not used here // Outputs - auto out_ptr = out->untyped_data(); + auto out_updated_ptr = out_updated->untyped_data(); auto out_amax_updated_ptr = reinterpret_cast(out_amax_updated->untyped_data()); auto out_scale_updated_ptr = reinterpret_cast(out_scale_updated->untyped_data()); - auto out_dtype = convert_ffi_datatype_to_te_dtype(out->element_type()); + auto out_dtype = convert_ffi_datatype_to_te_dtype(out_updated->element_type()); auto pre_gelu_out_ptr = pre_gelu_out->untyped_data(); auto bias_grad_ptr = bias_grad->untyped_data(); - // dummy_out is the extra output for comm+GEMM overlap, not used here + // dummy_out is the updated extra output for comm+GEMM overlap, not used here auto workspace_ptr = workspace->untyped_data(); auto workspace_size = workspace->dimensions().back(); @@ -119,6 +126,7 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i NVTE_CHECK(bias_ptr == bias_grad_ptr, "bias not bound to bias_grad in TE/JAX GEMM"); NVTE_CHECK(gelu_input_ptr == pre_gelu_out_ptr, "gelu_input not bound to pre_gelu_out in TE/JAX GEMM"); + NVTE_CHECK(out_ptr == out_updated_ptr, "out not bound to out_updated in TE/JAX GEMM"); 
NVTE_CHECK(out_amax_ptr == out_amax_updated_ptr, "out_amax not bound to out_amax_updated in TE/JAX GEMM"); NVTE_CHECK(out_scale_ptr == out_scale_updated_ptr, @@ -146,9 +154,11 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GemmHandler, GemmFFI, .Arg() // rhs_scale_inv .Arg() // bias .Arg() // gelu_input + .Arg() // out .Arg() // out_amax .Arg() // out_scale - .Ret() // out + .Arg() // dummy_in + .Ret() // out_updated .Ret() // out_amax_updated .Ret() // out_scale_updated .Ret() // pre_gelu_out diff --git a/transformer_engine/jax/csrc/extensions/packing.cpp b/transformer_engine/jax/csrc/extensions/packing.cpp index 31a53529e3..dd4070af41 100644 --- a/transformer_engine/jax/csrc/extensions/packing.cpp +++ b/transformer_engine/jax/csrc/extensions/packing.cpp @@ -90,24 +90,5 @@ pybind11::bytes PackCustomCallGemmDescriptor(size_t m, size_t n, size_t k, size_ grad, accumulate, use_split_accumulator}); } -pybind11::bytes PackCustomCallBufferDescriptor(const std::string &name, - const std::vector &shape, DType dtype, - CommOverlapType comm_type) { - return PackOpaque( - {name, shape.data(), shape.size(), dtype, comm_type}); -} - -pybind11::bytes PackCustomCallOverlapDescriptor(size_t m, size_t k, size_t n, size_t workspace_size, - DType operand_dtype, DType bias_dtype, - DType out_dtype, bool lhs_trans, bool rhs_trans, - bool fuse_gelu, bool fuse_bias, bool grad, - bool accumulate, bool use_split_accumulator, - CommOverlapType comm_type, - const std::string &name) { - return PackOpaque( - {m, n, k, workspace_size, operand_dtype, bias_dtype, out_dtype, lhs_trans, rhs_trans, - fuse_gelu, fuse_bias, grad, accumulate, use_split_accumulator, comm_type, name}); -} - } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index 2bf13a600d..c61e9c8127 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -54,8 +54,6 @@ pybind11::dict Registrations() { dict["te_fused_attn_forward"] = EncapsulateFunction(FusedAttnForward); dict["te_fused_attn_backward"] = EncapsulateFunction(FusedAttnBackward); dict["te_gemm"] = EncapsulateFunction(Gemm); - dict["te_copy_into_overlap_buffer"] = EncapsulateFunction(CopyIntoOverlapBuffer); - dict["te_comm_gemm_overlap"] = EncapsulateFunction(CommGemmOverlap); // Transpose dict["te_transpose_ffi"] = EncapsulateFFI(TransposeHandler); @@ -106,9 +104,18 @@ pybind11::dict Registrations() { fused_attn_backward_ffi["execute"] = EncapsulateFFI(FusedAttnBackwardHandler); dict["te_fused_attn_backward_ffi"] = fused_attn_backward_ffi; - dict["te_gemm_ffi"] = EncapsulateFFI(GemmHandler); + pybind11::dict gemm_ffi; + gemm_ffi["prepare"] = EncapsulateFFI(CublasltHandleInitHandler); + gemm_ffi["execute"] = EncapsulateFFI(GemmHandler); + dict["te_gemm_ffi"] = gemm_ffi; + + dict["te_bootstrap_comm_gemm_overlap_ffi"] = EncapsulateFFI(BootstrapCommGemmOverlapHandler); dict["te_copy_into_overlap_buffer_ffi"] = EncapsulateFFI(CopyIntoOverlapBufferHandler); - dict["te_comm_gemm_overlap_ffi"] = EncapsulateFFI(CommGemmOverlapHandler); + + pybind11::dict comm_gemm_overlap_ffi; + comm_gemm_overlap_ffi["prepare"] = EncapsulateFFI(CublasltHandleInitHandler); + comm_gemm_overlap_ffi["execute"] = EncapsulateFFI(CommGemmOverlapHandler); + dict["te_comm_gemm_overlap_ffi"] = comm_gemm_overlap_ffi; return dict; } @@ -125,8 +132,6 @@ PYBIND11_MODULE(transformer_engine_jax, m) { m.def("pack_softmax_descriptor", &PackCustomCallSoftmaxDescriptor); 
m.def("pack_fused_attn_descriptor", &PackCustomCallFusedAttnDescriptor); m.def("pack_gemm_descriptor", &PackCustomCallGemmDescriptor); - m.def("pack_buffer_descriptor", &PackCustomCallBufferDescriptor); - m.def("pack_overlap_descriptor", &PackCustomCallOverlapDescriptor); m.def("get_fused_attn_backend", &GetFusedAttnBackend); m.def("get_cuda_version", &GetCudaRuntimeVersion); m.def("get_cudnn_version", &GetCudnnRuntimeVersion); @@ -140,7 +145,7 @@ PYBIND11_MODULE(transformer_engine_jax, m) { m.def("get_fused_attn_bwd_workspace_sizes", &GetFusedAttnBackwardWorkspaceSizes); m.def("nvte_get_qkv_format", &nvte_get_qkv_format); m.def("bootstrap_comm_gemm_overlap", &BootstrapCommGemmOverlap); - m.def("destroy_comm_gemm_overlaps", &DestroyCommGemmOverlap); + m.def("destroy_comm_gemm_overlap", &DestroyCommGemmOverlap); m.def("set_buffer_scale_inv", &SetOverlapBufferScaleInverse, pybind11::arg(), pybind11::arg(), pybind11::arg("grad") = false); m.def("get_overlap_buffer", &GetOverlapBuffer); diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index e463f0ace2..9b9afd56ca 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -10,7 +10,6 @@ import jax import jax.numpy as jnp from jax.typing import ArrayLike -from jax.sharding import NamedSharding, PartitionSpec from transformer_engine import transformer_engine_jax as tex from .fp8 import FP8Helper, FP8MetaPackage @@ -34,6 +33,7 @@ "type_safe_gemm", "initialize_comm_gemm_overlaps", "destroy_comm_gemm_overlap", + "get_comm_gemm_overlap_config", ] _NUM_MAX_UB_STREAMS = 3 @@ -83,7 +83,13 @@ def gemm( """ comm_overlap_config = None if comm_overlap_name is not None: - comm_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(comm_overlap_name, None) + global _ACTIVE_COMM_GEMM_OVERLAPS + comm_overlap_layer = ( + comm_overlap_name + "_fprop" + if comm_overlap_name not in ["ag_gemm", "gemm_rs"] + else comm_overlap_name + ) + comm_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(comm_overlap_layer, None) if comm_overlap_config is None: warnings.warn( f"Comm+GEMM overlap for {comm_overlap_name} has not been initialized! 
" @@ -97,7 +103,7 @@ def gemm( ): if sanitize_dims(contracting_dims[0], x.ndim) != x.ndim - 1: x = jnp.matrix_transpose(x) - copy_into_overlap_buffer(x, comm_overlap_name, tex.CommOverlapType.RS) + copy_into_overlap_buffer(x, comm_overlap_name, True) return _gemm( x, @@ -151,11 +157,11 @@ def _gemm_fwd_rule( fuse_bias = bias is not None - # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) ------> ([B], M, N/P) - # (DP, TP, None) --(AG)--> (DP, None, None) x (None, TP) --> (DP, None, TP) + # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) --> ([B], M, N/P) # # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) - # (DP, None, TP) x (TP, None) --(AR)--> (DP, None, None) + # + # GEMM+RS: ([B], M, K/P) x (K/P, N) --(RS)--> ([B], M/P, N) out, pre_gelu_out, extra_out = gemm_impl( x, kernel, @@ -169,20 +175,15 @@ def _gemm_fwd_rule( comm_overlap_config=comm_overlap_config, ) - # Update returned and saved tensors based on comm+GEMM overlap - saved_x = x final_out = out - if comm_overlap_config is not None: - match comm_overlap_config.get("comm_type", None): - case tex.CommOverlapType.AG: - # AG overlap puts the all-gathered global LHS (X) into extra_out - saved_x = extra_out - case tex.CommOverlapType.RS: - # RS overlap puts the reduce-scattered sharded output into extra_out - final_out = extra_out + if (comm_overlap_config is not None + and comm_overlap_config["method"] != "bulk" + and comm_overlap_config["comm_type"] == tex.CommOverlapType.RS): + # Non-bulk RS overlap output is in extra output, not usual output + final_out = extra_out ctx = ( - saved_x, + x, kernel, pre_gelu_out if fuse_gelu else None, fuse_bias, @@ -207,26 +208,47 @@ def _gemm_bwd_rule( ) dgrad_overlap_config = None + wgrad_overlap_config = None + dgrad_pre_rs = None if comm_overlap_config is not None: dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) + if (dgrad_overlap_config["method"] == "bulk" + and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.AG): + # If DGRAD is bulk overlap, copy input X into comm buffer to be all-gathered in + # preparation for WGRAD. + wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" + wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + assert wgrad_overlap_config is not None, "Internal TE error!" 
+ copy_into_overlap_buffer(x, dgrad_overlap_name, True) + + # Set DGRAD output buffer to the comm buffer of WGRAD GEMM in order to do the + # bulk RS overlap without an extra memcpy + dgrad_pre_rs = tex.get_overlap_buffer(wgrad_overlap_name, False) # FWD MODE: # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) ------> ([B], M, N/P) - # (DP, TP, None) --(AG)--> (DP, None, None) x (None, TP) --> (DP, None, TP) # # GEMM+AR: ([B], M, K/P) x (K/P, N) --(AR)--> ([B], M, N) - # (DP, None, TP) x (TP, None) --(AR)--> (DP, None, None) + # + # GEMM+RS: ([B], M, K/P) x (K/P, N) --(RS)--> ([B], M/P, N) - # DGRAD: - # AG+GEMM: ([B], M, N/P) x (K, N/P)^T ----(AR)----> ([B], M, K) - # (DP, None, TP) x (None, TP)^T --(AR)--> (DP, None, None) + # DGRAD w/o Overlap: + # AG+GEMM: ([B], M, N/P) x (K, N/P)^T ---(AR)---> ([B], M, K) + # + # GEMM+AR: ([B], M, N) x (K/P, N)^T ----> ([B], M, K/P) + # + # DGRAD w/ Overlap: + # AG+GEMM w/ DGRAD+RS Overlap: ([B], M, N/P) x (K, N/P)^T ---(RS)---> ([B], M/P, K) # - # GEMM+AR: ([B], M, N) x (K/P, N)^T ------> ([B], M, K/P) - # (DP, None, None) x (TP, None)^T --> (DP, None, TP) + # AG+GEMM w/ Bulk AG Overlap: ([B], M, N/P) x (K, N/P)^T -----> ([B], M, K) (deferred RS) + # ([B], M, K/P) --(Bulk AG)--> ([B], M, K) (needed in WGRAD) + # + # GEMM+RS: ([B], M/P, N) --(AG)--> ([B], M, N) x (K/P, N)^T ----> ([B], M, K/P) dgrad, dgelu, _, dgrad_extra_out = gemm_impl( grad, kernel, + out=dgrad_pre_rs, gelu_input=pre_gelu_out, batched_output=(x.ndim > 2), contracting_dims=(-1, kernel_outer_dim), @@ -238,38 +260,25 @@ def _gemm_bwd_rule( comm_overlap_config=dgrad_overlap_config, ) - # If dgrad overlapped reduce-scatter, set it to the RS output - if dgrad_overlap_config is not None: - if ( - dgrad_overlap_config["method"] != "bulk" - and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS - ): - dgrad = dgrad_extra_out - - # Collapse batch dimension for wgrad - wgrad_rhs = dgelu if fuse_gelu else grad - if x.ndim > 2: - # If x was originally transposed, we need to transpose it back in order to collapse - # the batch dims correctly. 
- if x_inner_dim == x.ndim - 2: - x = jnp.matrix_transpose(x) - batch_size = reduce(operator.mul, x.shape[:-2], 1) - x = jnp.reshape(x, (batch_size * x.shape[-2], x.shape[-1])) - wgrad_rhs = jnp.reshape(wgrad_rhs, (batch_size * wgrad_rhs.shape[-2], wgrad_rhs.shape[-1])) - - # Recover comm+GEMM overlap config for wgrad - wgrad_overlap_config = None - if comm_overlap_config is not None: - wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" - wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + if (dgrad_overlap_config is not None + and dgrad_overlap_config["method"] != "bulk" + and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS): + # Otherwise, if DGRAD overlap is RS overlap, DGRAD output is the extra output tensor + dgrad = dgrad_extra_out - # WGRAD: + # WGRAD w/o Overlap: # AG+GEMM: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) - # (DP, 'tp', None)^T --(AG)-->(DP, None, None)^T x (DP, None, 'tp') --> (None, 'tp') # - # GEMM+AR: ([B], M, K/P)^T --(AG)--> ([B], M, K)^T x ([B], M, N) ---------> (K/P, N) - # (DP, None, 'tp')^T --(AG)--> (DP, None, None)^T x (DP, None, None) ----> (None, None) - wgrad_rhs = dgelu if fuse_gelu else grad + # GEMM+AR: ([B], M, K/P)^T --(AG)--> ([B], M, K)^T x ([B], M, N) ---------> (K, N) + # + # WGRAD w/ Overlap: + # AG+GEMM w/ DGRAD+RS Overlap: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) + # + # AG+GEMM w/ Bulk Overlaps: ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) + # ([B], M, K) --(Bulk RS)--> ([B], M/P, K) (finalize DGRAD) + # + # GEMM+RS: ([B], M, K/P)^T x ([B], M, N) --> (K/P, N) (re-use all-gathered GRAD from DGRAD) + wgrad_rhs = dgelu if fuse_gelu else (grad if comm_overlap_config is None else dgrad_extra_out) wgrad, _, bgrad, wgrad_extra_out = gemm_impl( x, wgrad_rhs, @@ -284,13 +293,9 @@ def _gemm_bwd_rule( comm_overlap_config=wgrad_overlap_config, ) - # If wgrad overlapped reduce-scatter, set it to the RS output if wgrad_overlap_config is not None: - if ( - wgrad_overlap_config["method"] != "bulk" - and wgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS - ): - wgrad = wgrad_extra_out + # DGRAD was reduce-scattered during WGRAD GEMM, so set DGRAD to WGRAD extra output here + dgrad = wgrad_extra_out if not fuse_bias: bgrad = None @@ -362,7 +367,7 @@ def fp8_gemm( and comm_overlap_config["method"] != "bulk" and comm_overlap_config["comm_type"] == tex.CommOverlapType.AG ): - copy_into_overlap_buffer(x, comm_overlap_name, tex.CommOverlapType.RS) + copy_into_overlap_buffer(x, comm_overlap_name, True) return _fp8_gemm( x, @@ -526,18 +531,12 @@ def _fp8_gemm_fwd_rule( # Update returned and saved arrays based on comm+GEMM overlap config final_out = out - saved_casted_x = casted_x if comm_overlap_config is not None: - match comm_overlap_config.get("comm_type", None): - case tex.CommOverlapType.AG: - # AG overlap puts all-gathered global LHS (X) array into extra_out - saved_casted_x = extra_out - case tex.CommOverlapType.RS: - # RS overlap puts the reduce-scattered sharded output into extra_out - final_out = extra_out + if comm_overlap_config["comm_type"] == tex.CommOverlapType.RS: + # RS overlap puts the reduce-scattered sharded output into extra_out + final_out = extra_out ctx = ( - saved_casted_x, casted_x_t, casted_kernel, amax_list, @@ -820,29 +819,59 @@ def type_safe_gemm( def initialize_comm_gemm_overlaps( buffer_shape: Sequence[int], - buffer_dtype: jnp.dtype, - mesh: Optional[jax.sharding.Mesh] = None, - tp_resource: Optional[str] = 
None, - use_fp8: bool = False, - overlap_configs: Optional[dict] = None, + mesh: jax.sharding.Mesh, + myrank: int, + numranks: int, + **kwargs: Optional[dict], ) -> None: + """ + Initialize Comm+GEMM overlap communicators and buffers. + + .. warning:: + Communication buffer allocations for this functionality are outside the XLA memory pool + and can cause OOM errors if XLA's memory margin is not reduced. + + Parameters + ---------- + buffer_shape : Sequence[int] + Shape of the communication buffer. This should be sized to match the global shape of the + input/activation tensor. + mesh : jax.sharding.Mesh + JAX Mesh with a `tp_resource` axis. + myrank: int + Global rank of the calling process. + numranks: int + Global number of processes. + tp_resource : Optional[str] = None + Tensor-parallel mesh axis name. If not given, defaults to the TP resource in the global + te.sharding.MeshResource context. + tp_size : Optional[int] = None + Size of the tensor-parallel axis in the mesh. If not given, defaults to the size of the + tensor-parallel axis in `jax.interpreters.pxla.thread_resources`. + use_fp8 : bool = False + Flag for allocating an FP8 communication buffer. This is not supported for reduce-scatter + overlaps with the `pipeline` method. + overlap_configs: Optional[dict] = None, + Dictionary of configs for comm+GEMM overlaps by layer name. + """ assert tex.ubuf_built_with_mpi(), ( "Comm+GEMM overlap in TE/JAX requires Transformer Engine to be compiled with " - + "`NVTE_UB_WITH_MPI=1` and `MPI_HOME=/path/to/mpi` options." + + "`NVTE_UB_WITH_MPI=1` and `MPI_HOME=/path/to/mpi` variables." ) if not tex.device_supports_multicast(): assert bool(int(os.getenv("UB_SKIPMC", "0"))), ( "CUDA device, driver and/or toolkit version does not support comm+GEMM overlap with " - + "CUDA Multicast. Launch app with UB_SKIPMC=1 to try CUDA IPC instead." + + "CUDA Multicast. Launch with UB_SKIPMC=1 to try CUDA IPC instead." 
) - - # Get # of devices in the mesh axis for comm+GEMM overlap - tp_resource = global_mesh_resource().tp_resource if tp_resource is None else tp_resource - tp_size = get_mesh_axis_size(tp_resource, mesh=mesh) + # Extract kwargs + tp_resource = kwargs.get("tp_resource", global_mesh_resource().tp_resource) + tp_size = kwargs.get("tp_size", get_mesh_axis_size(tp_resource, mesh=mesh)) + use_fp8 = kwargs.get("use_fp8", False) + overlap_configs = kwargs.get("overlap_configs", None) # Layers that support comm+GEMM overlap layers_all_gather_overlap = [ - "generic_ag", + "ag_gemm", "qkv_fprop", "qkv_dgrad", "proj_dgrad", @@ -851,7 +880,7 @@ def initialize_comm_gemm_overlaps( "fc2_dgrad", ] layers_reduce_scatter_overlap = [ - "generic_rs", + "gemm_rs", "proj_fprop", "fc2_fprop", "qkv_wgrad", @@ -862,8 +891,8 @@ def initialize_comm_gemm_overlaps( # Default overlap methods for layers methods = { "ring_exchange": [ - "generic_ag", - "generic_rs", + "ag_gemm", + "gemm_rs", "qkv_fprop", "fc1_fprop", "proj_dgrad", @@ -874,7 +903,10 @@ def initialize_comm_gemm_overlaps( } # AG-RS overlap pairs of layers forming a tensor-parallel block - ag_rs_pairs = {"qkv_fprop": "proj_fprop", "fc1_fprop": "fc2_fprop"} + ag_rs_pairs = { + "qkv_fprop": "proj_fprop", + "fc1_fprop": "fc2_fprop", + } rs_ag_pairs = {v: k for k, v in ag_rs_pairs.items()} global layers_atomic_ring_exchange layers_atomic_ring_exchange = [] @@ -888,11 +920,16 @@ def get_method(name): def get_default_config(name): method = get_method(name) default_cfg = { + "mesh": mesh, + "tp_resource": tp_resource, + "tp_size": tp_size, + "name": name, "method": method, "comm_type": ( tex.CommOverlapType.AG if name in layers_all_gather_overlap else tex.CommOverlap.RS ), "num_sm": 1 if method == "ring_exchange" else 16, + "num_max_streams": _NUM_MAX_UB_STREAMS, "cga_size": 1 if method == "ring_exchange" else 2, "set_sm_margin": False, "num_splits": 4 if method == "pipeline" else tp_size, @@ -905,76 +942,75 @@ def get_default_config(name): return default_cfg def add_new_comm_gemm_overlap( - name: str, - method: str, shape: Sequence[int], - dtype: jnp.dtype, - comm_type: tex.CommOverlapType, - num_sm: int = 16, - cga_size: int = 2, - set_sm_margin: bool = False, - num_splits: int = 4, - aggregate: bool = False, - atomic_gemm: bool = False, - pipeline_rs_overlap_first_gemm: bool = False, - use_ce: bool = True, - fp8_buf: bool = False, + kwargs: dict, ) -> None: + overlap_name = kwargs["name"] assert ( - name not in _ACTIVE_COMM_GEMM_OVERLAPS - ), "Duplicate initialization for `{name}` overlap!" + overlap_name not in _ACTIVE_COMM_GEMM_OVERLAPS + ), f"Duplicate initialization for `{overlap_name}` overlap!" - if atomic_gemm: + overlap_method = kwargs["method"] + overlap_atomic_gemm = kwargs["atomic_gemm"] + if overlap_atomic_gemm: warnings.warn( "Atomic GEMM uses a beta API from cublas and is not tested for all use cases." ) assert use_fp8, "Atomic GEMM overlap supported only for FP8 GEMM." - if method == "bulk": + if overlap_method == "bulk": warnings.warn( - f"At {name}, atoimic GEMM not is supported for a bulk overlap." + f"At {overlap_name}, atoimic GEMM not is supported for a bulk overlap." "Defaulting to `atomic_gemm=False`." ) - atomic_gemm = False - if method == "pipeline" and comm_type == tex.CommOverlapType.AG: + overlap_atomic_gemm = False + kwargs["atomic_gemm"] = overlap_atomic_gemm + if overlap_method == "pipeline" and kwargs["comm_type"] == tex.CommOverlapType.AG: raise ValueError( - f"At {name}, `pipeline` overlap method is not supported for AllGather." 
+ f"At {overlap_name}, `pipeline` overlap method is not supported for AllGather." ) # Check if both AG and RS overlaps use `atomic GEMM`` + `p2p ring-exchange`. # Using atomic GEMM + p2p ring-exchange in only one of the pair breaks functionality. global layers_atomic_ring_exchange - if atomic_gemm and method == "ring_exchange" and name in ag_rs_pairs: - layers_atomic_ring_exchange += [name, ag_rs_pairs[name]] - if name in rs_ag_pairs: + if (overlap_atomic_gemm + and overlap_method == "ring_exchange" + and overlap_name in ag_rs_pairs): + layers_atomic_ring_exchange += [overlap_name, ag_rs_pairs[overlap_name]] + if overlap_name in rs_ag_pairs: assert_message = ( - f"At {name}, atomic AG-GEMM overlap with `ring_exchange` shuffles GEMM chunk " - "outputs, and RS-GEMM overlap un-suffle them. When one of the GEMM-AG and " + f"At {overlap_name}, atomic AG-GEMM overlap with `ring_exchange` shuffles GEMM " + "chunk outputs, and RS-GEMM overlap un-suffle them. When one of the GEMM-AG and " "GEMM-RS overlaps forming a TP block (e.g., qkv_fprop and proj_fprop) uses " "`atomic gemm` and `ring_exhcnage`, its pair must use the same overlap config " "for functionality." ) - if name in layers_atomic_ring_exchange: - assert atomic_gemm and method == "ring_exchange", assert_message + if overlap_name in layers_atomic_ring_exchange: + assert overlap_atomic_gemm and overlap_method == "ring_exchange", assert_message else: - if atomic_gemm and method == "ring_exchange": - assert rs_ag_pairs[name] in layers_atomic_ring_exchange, assert_message + if overlap_atomic_gemm and overlap_method == "ring_exchange": + assert ( + rs_ag_pairs[overlap_name] in layers_atomic_ring_exchange + ), assert_message - dtype = jnp.uint8 if (use_fp8 and fp8_buf) else dtype + # Reduce buffer shape to 2D here in case the user initialized with batch dims + buffer_shape = (reduce(operator.mul, shape[:-1], 1), shape[-1]) tex.bootstrap_comm_gemm_overlap( - name, - method, - shape, - jax_dtype_to_te_dtype(dtype), - comm_type, + buffer_shape, + jax_dtype_to_te_dtype(jnp.uint8 if (use_fp8 and fp8_buf) else jnp.bfloat16), + overlap_name, + overlap_method, + kwargs["comm_type"], + myrank, + numranks, tp_size, - num_splits, + kwargs["num_splits"], _NUM_MAX_UB_STREAMS, - cga_size, - num_sm, - set_sm_margin, - use_ce, - atomic_gemm, - aggregate, - pipeline_rs_overlap_first_gemm, + kwargs["cga_size"], + kwargs["num_sm"], + kwargs["set_sm_margin"], + kwargs["use_ce"], + overlap_atomic_gemm, + kwargs["aggregate"], + kwargs["pipeline_rs_overlap_first_gemm"], ) if overlap_configs is not None: @@ -998,17 +1034,25 @@ def add_new_comm_gemm_overlap( for name in methods["ring_exchange"] + methods["pipeline"] + methods["bulk"]: if overlap_configs is not None and name in overlap_configs: fp8_buf = (name in layers_all_gather_overlap) or ( - overlap_configs[name].get("fp8_buf", False) and name in methods["pipeline"] + overlap_configs[name].get("fp8_buf", False) and name not in methods["pipeline"] ) - default_config = get_default_config(name) - final_config = default_config.update(overlap_configs[name]) + final_config = get_default_config(name) + final_config.update(overlap_configs[name]) final_config["fp8_buf"] = fp8_buf - add_new_comm_gemm_overlap(name, buffer_shape, buffer_dtype, **final_config) - _ACTIVE_COMM_GEMM_OVERLAPS.update({name: final_config}) + add_new_comm_gemm_overlap(buffer_shape, final_config) + _ACTIVE_COMM_GEMM_OVERLAPS[name] = final_config def destroy_comm_gemm_overlaps(): + global _ACTIVE_COMM_GEMM_OVERLAPS for name in 
_ACTIVE_COMM_GEMM_OVERLAPS: tex.destroy_comm_gemm_overlap(name) - _ACTIVE_COMM_GEMM_OVERLAPS.pop(name) _ACTIVE_COMM_GEMM_OVERLAPS = dict() + + +def get_comm_overlap_config(name): + global _ACTIVE_COMM_GEMM_OVERLAPS + assert name in _ACTIVE_COMM_GEMM_OVERLAPS, ( + f"Comm+GEMM overlap for '{name}' has not been initialized!" + ) + return _ACTIVE_COMM_GEMM_OVERLAPS[name] From aa16726307aadbf64193796f77eb113778f5d0a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:09:06 +0000 Subject: [PATCH 22/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/cpp_extensions/gemm.py | 127 ++++++++++-------- transformer_engine/jax/csrc/extensions.h | 61 +++++---- .../jax/csrc/extensions/comm_gemm_overlap.cpp | 75 ++++++----- .../jax/csrc/extensions/gemm.cpp | 17 ++- transformer_engine/jax/gemm.py | 34 +++-- 5 files changed, 171 insertions(+), 143 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 59bf28434d..b43c644a51 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -149,9 +149,9 @@ def abstract( (lhs_aval.ndim, rhs_aval.ndim), ) if lhs_aval.ndim > 2 and rhs_aval.ndim > 2: - assert not batched_output, ( - "Batched output requires batched LHS and non-batched RHS operands." - ) + assert ( + not batched_output + ), "Batched output requires batched LHS and non-batched RHS operands." lhs_bdims = [ dim for dim in range(lhs_aval.ndim) if dim not in [lhs_outer_dim, lhs_inner_dim] ] @@ -170,17 +170,17 @@ def abstract( # Validate output dtypes out_dtype = dtypes.canonicalize_dtype(out_aval.dtype) if jax_dtype_is_fp8(out_dtype): - assert jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8(rhs_dtype), ( - "FP8 GEMM output requires FP8 inputs." - ) - assert out_amax_aval.size == out_scale_aval.size == 1, ( - "Invalid/missing output amax and scale." - ) + assert jax_dtype_is_fp8(lhs_dtype) and jax_dtype_is_fp8( + rhs_dtype + ), "FP8 GEMM output requires FP8 inputs." + assert ( + out_amax_aval.size == out_scale_aval.size == 1 + ), "Invalid/missing output amax and scale." out_amax_updated_dtype = dtypes.canonicalize_dtype(out_amax_aval.dtype) out_scale_updated_dtype = dtypes.canonicalize_dtype(out_scale_aval.dtype) - assert out_amax_updated_dtype == out_scale_updated_dtype == jnp.float32, ( - "Invalid output amax or scale dtype." - ) + assert ( + out_amax_updated_dtype == out_scale_updated_dtype == jnp.float32 + ), "Invalid output amax or scale dtype." else: assert out_dtype == lhs_dtype, ( "Output buffer has incorrect dtype: " @@ -194,7 +194,7 @@ def abstract( expected_out_shape = [ *lhs_aval.shape[:-2], lhs_aval.shape[lhs_outer_dim], - rhs_aval.shape[rhs_outer_dim] + rhs_aval.shape[rhs_outer_dim], ] extra_out_shape = extra_out_aval.shape expected_extra_out_shape = [0] @@ -203,18 +203,19 @@ def abstract( if batched_output: assert out_aval.ndim > 2, "Batched output buffer is missing batch dimensions." 
else: - expected_out_shape = [reduce(operator.mul, expected_out_shape[:-1], 1), - expected_out_shape[-1]] + expected_out_shape = [ + reduce(operator.mul, expected_out_shape[:-1], 1), + expected_out_shape[-1], + ] - if (comm_overlap_config is not None - and comm_overlap_config["method"] != "bulk"): + if comm_overlap_config is not None and comm_overlap_config["method"] != "bulk": comm_type = comm_overlap_config.get("comm_type", None) assert comm_type is not None, "Missing comm type for comm+GEMM overlap." tp_size = comm_overlap_config.get("tp_size", 1) - assert tp_size > 1, ( - "Comm+GEMM overlap requires tensor-parallel mesh axis size greater than 1." - ) + assert ( + tp_size > 1 + ), "Comm+GEMM overlap requires tensor-parallel mesh axis size greater than 1." if comm_type == tex.CommOverlapType.AG: expected_extra_out_shape = list(lhs_aval.shape).copy() @@ -246,8 +247,12 @@ def abstract( "Extra output buffer has incorrect number of dimensions: " + f"expected {len(expected_extra_out_shape)} but found {extra_out_aval.ndim}" ) - assert all([extra_out_aval.shape[i] == expected_extra_out_shape[i] - for i in range(extra_out_aval.ndim)]), ( + assert all( + [ + extra_out_aval.shape[i] == expected_extra_out_shape[i] + for i in range(extra_out_aval.ndim) + ] + ), ( "Extra output buffer has incorrect shape: " + f"expected {expected_extra_out_shape=} but found {extra_out_aval.shape=}" ) @@ -350,7 +355,7 @@ def lowering( accumulate, use_split_accumulator, comm_overlap_config, - sharded_abstract + sharded_abstract, ): """ Fused attention fwd lowering rules @@ -586,7 +591,7 @@ def impl( out_shape = ( *lhs_batch_shape, out_updated.shape[-2] // lhs_batch_size, - out_updated.shape[-1] + out_updated.shape[-1], ) out_updated = jax.lax.reshape(out_updated, out_shape) @@ -594,7 +599,7 @@ def impl( extra_out_shape = ( *lhs_batch_shape, extra_out_updated.shape[-2] // lhs_batch_size, - extra_out_updated.shape[-1] + extra_out_updated.shape[-1], ) extra_out_updated = jax.lax.reshape(extra_out_updated, extra_out_shape) @@ -653,7 +658,7 @@ def batcher( out_scale_bdims, gelu_input_bdims, bias_bdims, - extra_out_bdims + extra_out_bdims, ), ) @@ -879,8 +884,9 @@ def partition( extra_out_spec[lhs_outer_dim] = None elif comm_type == tex.CommOverlapType.RS: extra_out_spec = list(out_spec).copy() - extra_out_spec[-2] = comm_overlap_config.get("tp_resource", - global_mesh_resource().tp_resource) + extra_out_spec[-2] = comm_overlap_config.get( + "tp_resource", global_mesh_resource().tp_resource + ) extra_out_sharding = NamedSharding(mesh, PartitionSpec(*extra_out_spec)) arg_shardings = ( @@ -905,7 +911,15 @@ def partition( ) def sharded_impl( - lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, out, out_amax, out_scale, + lhs, + lhs_scale_inv, + rhs, + rhs_scale_inv, + bias, + gelu_input, + out, + out_amax, + out_scale, extra_out, ): ( @@ -953,7 +967,7 @@ def sharded_impl( out_scale_updated, pre_gelu_out, bias_grad, - extra_out_updated + extra_out_updated, ) return mesh, sharded_impl, out_shardings, arg_shardings @@ -993,8 +1007,7 @@ def gemm_impl( if extra_out is None: extra_out_shape = 0 - if (comm_overlap_config is not None - and comm_overlap_config["method"] != "bulk"): + if comm_overlap_config is not None and comm_overlap_config["method"] != "bulk": comm_type = comm_overlap_config["comm_type"] if comm_type == tex.CommOverlapType.AG: extra_out_shape = list(lhs.shape).copy() @@ -1012,9 +1025,9 @@ def gemm_impl( if not fuse_gelu: gelu_input = jnp.zeros(0, dtype=lhs.dtype) elif grad: - assert gelu_input is not None, ( - 
"Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." - ) + assert ( + gelu_input is not None + ), "Backward GEMM with dGELU epilogue requires pre-GELU output from forward GEMM." elif gelu_input is None: gelu_input = jnp.zeros(out_shape_2d, dtype=lhs.dtype) @@ -1083,8 +1096,7 @@ def fp8_gemm_impl( if extra_out is None: extra_out_shape = 0 - if (comm_overlap_config is not None - and comm_overlap_config["method"] != "bulk"): + if comm_overlap_config is not None and comm_overlap_config["method"] != "bulk": comm_type = comm_overlap_config["comm_type"] if comm_type == tex.CommOverlapType.AG: extra_out_shape = list(lhs.shape).copy() @@ -1151,9 +1163,9 @@ def abstract(buffer_aval, myrank, numranks, comm_overlap_config): del myrank, numranks assert is_ffi_enabled(), "Comm+GEMM overlap is supported only via XLA FFI." overlap_name = comm_overlap_config.get("name", None) - assert overlap_name in _COMM_GEMM_OVERLAP_NAMES, ( - f"Unrecognized comm+GEMM overlap name: {overlap_name=}" - ) + assert ( + overlap_name in _COMM_GEMM_OVERLAP_NAMES + ), f"Unrecognized comm+GEMM overlap name: {overlap_name=}" assert buffer_aval.size > 0, "Cannot initialize a zero-size communication buffer." return jax.core.ShapedArray(shape=(0,), dtype=dtypes.canonicalize_dtype(buffer_aval.dtype)) @@ -1185,7 +1197,10 @@ def impl(buffer, myrank, numranks, comm_overlap_config): buffer, (reduce(operator.mul, buffer.shape[:-1], 1), buffer.shape[-1]) ) return BootstrapCommGemmOverlapPrimitive.inner_primitive.bind( - buffer, myrank=myrank, numranks=numranks, comm_overlap_config=comm_overlap_config, + buffer, + myrank=myrank, + numranks=numranks, + comm_overlap_config=comm_overlap_config, ) @staticmethod @@ -1194,14 +1209,18 @@ def batcher(batched_args, batch_dims, *, myrank, numranks, comm_overlap_config): check_valid_batch_dims(batch_dims) return ( BootstrapCommGemmOverlapPrimitive.inner_primitive.bind( - *batched_args, myrank=myrank, numranks=numranks, comm_overlap_config=comm_overlap_config + *batched_args, + myrank=myrank, + numranks=numranks, + comm_overlap_config=comm_overlap_config, ), None, ) @staticmethod - def infer_sharding_from_operands(myrank, numranks, comm_overlap_config, mesh, arg_infos, - result_infos): + def infer_sharding_from_operands( + myrank, numranks, comm_overlap_config, mesh, arg_infos, result_infos + ): del myrank, numranks, comm_overlap_config, result_infos buffer_spec = get_padded_spec(arg_infos[0]) assert all([spec is None for spec in buffer_spec]), "Sample buffer must be unsharded." 
@@ -1214,10 +1233,12 @@ def partition(myrank, numranks, comm_overlap_config, mesh, arg_infos, result_inf out_sharding = NamedSharding(mesh, PartitionSpec(None)) return ( mesh, - partial(BootstrapCommGemmOverlapPrimitive.impl, - myrank=myrank, - numranks=numranks, - comm_overlap_config=comm_overlap_config), + partial( + BootstrapCommGemmOverlapPrimitive.impl, + myrank=myrank, + numranks=numranks, + comm_overlap_config=comm_overlap_config, + ), out_sharding, arg_shardings, ) @@ -1227,16 +1248,10 @@ def partition(myrank, numranks, comm_overlap_config, mesh, arg_infos, result_inf def bootstrap_comm_gemm_overlap( - buffer: ArrayLike, - myrank: int, - numranks: int, - comm_overlap_config: dict + buffer: ArrayLike, myrank: int, numranks: int, comm_overlap_config: dict ): _ = BootstrapCommGemmOverlapPrimitive.outer_primitive.bind( - buffer, - myrank=myrank, - numranks=numranks, - comm_overlap_config=comm_overlap_config + buffer, myrank=myrank, numranks=numranks, comm_overlap_config=comm_overlap_config ) diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index fd0786a040..6bc6d02173 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -338,14 +338,14 @@ XLA_FFI_DECLARE_HANDLER_SYMBOL(CublasltHandleInitHandler); void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque_len); -Error_Type GemmFFI( - cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, - Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, - Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type dummy_in, Result_Type out_updated, - Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, - Result_Type bias_grad, Result_Type dummy_out, Result_Type workspace, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, - bool use_split_accumulator); +Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, + Buffer_Type out, Buffer_Type out_amax, Buffer_Type out_scale, + Buffer_Type dummy_in, Result_Type out_updated, Result_Type out_amax_updated, + Result_Type out_scale_updated, Result_Type pre_gelu_out, Result_Type bias_grad, + Result_Type dummy_out, Result_Type workspace, bool lhs_trans, bool rhs_trans, + bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, + bool use_split_accumulator); XLA_FFI_DECLARE_HANDLER_SYMBOL(GemmHandler); @@ -357,18 +357,21 @@ pybind11::object GetOverlapBuffer(const std::string &name, bool sharded); void SetOverlapBufferScaleInverse(const std::string &name, pybind11::object scale_inv, bool grad); -void BootstrapCommGemmOverlap( - const std::vector &buffer_shape, DType buffer_dtype, const std::string &name, - const std::string &method, CommOverlapType comm_type, int64_t myrank, int64_t numranks, - int64_t tp_size, int64_t num_splits, int64_t num_max_streams, int64_t cga_size, - int64_t num_comm_sm, bool set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, - bool pipeline_rs_overlap_first_gemm); - -Error_Type BootstrapCommGemmOverlapFFI( - cudaStream_t, Buffer_Type sample_buffer, std::string_view name, std::string_view method, - int64_t comm_type_flag, int64_t myrank, int64_t numranks, int64_t tp_size, int64_t num_splits, - int64_t num_max_streams, int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, - bool use_ce, bool atomic_gemm, 
bool aggregate, bool pipeline_rs_overlap_first_gemm); +void BootstrapCommGemmOverlap(const std::vector &buffer_shape, DType buffer_dtype, + const std::string &name, const std::string &method, + CommOverlapType comm_type, int64_t myrank, int64_t numranks, + int64_t tp_size, int64_t num_splits, int64_t num_max_streams, + int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, + bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm); + +Error_Type BootstrapCommGemmOverlapFFI(cudaStream_t, Buffer_Type sample_buffer, + std::string_view name, std::string_view method, + int64_t comm_type_flag, int64_t myrank, int64_t numranks, + int64_t tp_size, int64_t num_splits, int64_t num_max_streams, + int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, + bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm); XLA_FFI_DECLARE_HANDLER_SYMBOL(BootstrapCommGemmOverlapHandler); @@ -383,14 +386,16 @@ Error_Type CopyIntoOverlapBufferFFI(cudaStream_t stream, Buffer_Type input, std: XLA_FFI_DECLARE_HANDLER_SYMBOL(CopyIntoOverlapBufferHandler); -Error_Type CommGemmOverlapFFI( - cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, - Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, - Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type extra_out, Result_Type out_updated, - Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, - Result_Type bias_grad, Result_Type extra_out_updated, Result_Type workspace, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, - bool use_split_accumulator, int64_t comm_type_flag, std::string_view name); +Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, + Buffer_Type rhs, Buffer_Type rhs_scale_inv, Buffer_Type bias, + Buffer_Type gelu_input, Buffer_Type out, Buffer_Type out_amax, + Buffer_Type out_scale, Buffer_Type extra_out, Result_Type out_updated, + Result_Type out_amax_updated, Result_Type out_scale_updated, + Result_Type pre_gelu_out, Result_Type bias_grad, + Result_Type extra_out_updated, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator, int64_t comm_type_flag, + std::string_view name); XLA_FFI_DECLARE_HANDLER_SYMBOL(CommGemmOverlapHandler); diff --git a/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp b/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp index d6f5daaa80..533fdc3e83 100644 --- a/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp +++ b/transformer_engine/jax/csrc/extensions/comm_gemm_overlap.cpp @@ -17,7 +17,7 @@ namespace transformer_engine { namespace jax { Error_Type CublasltHandleInitFFI(Variadic_Buffer_Type args, Variadic_Result_Type rets, - Dictionary attrs) { + Dictionary attrs) { cublasLtHandle_t handle; NVTE_CHECK_CUBLAS(cublasLtCreate(&handle)); return ffi_with_cuda_error_check(); @@ -42,12 +42,13 @@ pybind11::object GetOverlapBuffer(const std::string &name, bool sharded) { return capsule; }; -void BootstrapCommGemmOverlap( - const std::vector &buffer_shape, DType buffer_dtype, const std::string &name, - const std::string &method, CommOverlapType comm_type, int64_t myrank, int64_t numranks, - int64_t tp_size, int64_t num_splits, int64_t num_max_streams, int64_t comm_cga_size, - int64_t num_comm_sm, bool set_sm_margin, bool use_ce, bool atomic_gemm, bool aggregate, - bool 
pipeline_rs_overlap_first_gemm) { +void BootstrapCommGemmOverlap(const std::vector &buffer_shape, DType buffer_dtype, + const std::string &name, const std::string &method, + CommOverlapType comm_type, int64_t myrank, int64_t numranks, + int64_t tp_size, int64_t num_splits, int64_t num_max_streams, + int64_t comm_cga_size, int64_t num_comm_sm, bool set_sm_margin, + bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm) { #ifndef NVTE_UB_WITH_MPI NVTE_ERROR( std::string("Comm+GEMM overlap in TE/JAX requires bootstrapping Userbuffers with MPI. ") + @@ -57,10 +58,10 @@ void BootstrapCommGemmOverlap( // Initialize overlap object -- this allocates the comm buffer NVTE_CHECK(_overlaps.find(name) == _overlaps.end(), name, " is already initialized!"); if (method == "ring_exchange") { - _overlaps[name] = new CommOverlapP2PBase( - buffer_shape, buffer_dtype, myrank, numranks, -1, -1, -1, -1, tp_size, &_dummy_allgather, - &_dummy_barrier, comm_type, num_max_streams, comm_cga_size, num_comm_sm, set_sm_margin, - use_ce, atomic_gemm, aggregate); + _overlaps[name] = new CommOverlapP2PBase(buffer_shape, buffer_dtype, myrank, numranks, -1, -1, + -1, -1, tp_size, &_dummy_allgather, &_dummy_barrier, + comm_type, num_max_streams, comm_cga_size, num_comm_sm, + set_sm_margin, use_ce, atomic_gemm, aggregate); } else { _overlaps[name] = new CommOverlapBase( buffer_shape, buffer_dtype, myrank, numranks, -1, -1, -1, -1, tp_size, &_dummy_allgather, @@ -69,19 +70,21 @@ void BootstrapCommGemmOverlap( } }; -Error_Type BootstrapCommGemmOverlapFFI( - cudaStream_t, Buffer_Type sample_buffer, std::string_view name, std::string_view method, - int64_t comm_type_flag, int64_t myrank, int64_t numranks, int64_t tp_size, int64_t num_splits, - int64_t num_max_streams, int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, - bool use_ce, bool atomic_gemm, bool aggregate, bool pipeline_rs_overlap_first_gemm) { - auto buffer_shape = std::vector(sample_buffer.dimensions().begin(), - sample_buffer.dimensions().end()); +Error_Type BootstrapCommGemmOverlapFFI(cudaStream_t, Buffer_Type sample_buffer, + std::string_view name, std::string_view method, + int64_t comm_type_flag, int64_t myrank, int64_t numranks, + int64_t tp_size, int64_t num_splits, int64_t num_max_streams, + int64_t cga_size, int64_t num_comm_sm, bool set_sm_margin, + bool use_ce, bool atomic_gemm, bool aggregate, + bool pipeline_rs_overlap_first_gemm) { + auto buffer_shape = + std::vector(sample_buffer.dimensions().begin(), sample_buffer.dimensions().end()); auto buffer_dtype = convert_ffi_datatype_to_te_dtype(sample_buffer.element_type()); - BootstrapCommGemmOverlap( - buffer_shape, buffer_dtype, static_cast(name), static_cast(method), - static_cast(comm_type_flag), myrank, numranks, tp_size, num_splits, - num_max_streams, cga_size, num_comm_sm, set_sm_margin, use_ce, atomic_gemm, aggregate, - pipeline_rs_overlap_first_gemm); + BootstrapCommGemmOverlap(buffer_shape, buffer_dtype, static_cast(name), + static_cast(method), + static_cast(comm_type_flag), myrank, numranks, tp_size, + num_splits, num_max_streams, cga_size, num_comm_sm, set_sm_margin, + use_ce, atomic_gemm, aggregate, pipeline_rs_overlap_first_gemm); return ffi_with_cuda_error_check(); } @@ -120,9 +123,7 @@ Error_Type DestroyCommGemmOverlapFFI(cudaStream_t stream, std::string_view name) } XLA_FFI_DEFINE_HANDLER_SYMBOL(DestroyComMGemmOverlapHandler, DestroyCommGemmOverlapFFI, - FFI::Bind() - .Ctx() - .Attr("name"), + FFI::Bind().Ctx().Attr("name"), FFI_CudaGraph_Traits); void 
CopyIntoOverlapBufferImpl(cudaStream_t stream, void *input_ptr, @@ -205,14 +206,16 @@ void CommGemmOverlapImpl(void *lhs, const std::vector &lhs_shape, DType } } -Error_Type CommGemmOverlapFFI( - cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, - Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, - Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type extra_out, Result_Type out_updated, - Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, - Result_Type bias_grad, Result_Type extra_out_updated, Result_Type workspace, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, - bool use_split_accumulator, int64_t comm_type_flag, std::string_view name) { +Error_Type CommGemmOverlapFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, + Buffer_Type rhs, Buffer_Type rhs_scale_inv, Buffer_Type bias, + Buffer_Type gelu_input, Buffer_Type out, Buffer_Type out_amax, + Buffer_Type out_scale, Buffer_Type extra_out, Result_Type out_updated, + Result_Type out_amax_updated, Result_Type out_scale_updated, + Result_Type pre_gelu_out, Result_Type bias_grad, + Result_Type extra_out_updated, Result_Type workspace, bool lhs_trans, + bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, + bool accumulate, bool use_split_accumulator, int64_t comm_type_flag, + std::string_view name) { // Inputs auto lhs_ptr = lhs.untyped_data(); auto lhs_shape = std::vector(lhs.dimensions().begin(), lhs.dimensions().end()); @@ -232,8 +235,8 @@ Error_Type CommGemmOverlapFFI( // Outputs auto out_updated_ptr = out_updated->untyped_data(); - auto out_shape = std::vector(out_updated->dimensions().begin(), - out_updated->dimensions().end()); + auto out_shape = + std::vector(out_updated->dimensions().begin(), out_updated->dimensions().end()); auto out_dtype = convert_ffi_datatype_to_te_dtype(out_updated->element_type()); auto out_amax_updated_ptr = reinterpret_cast(out_amax_updated->untyped_data()); auto out_scale_updated_ptr = reinterpret_cast(out_scale_updated->untyped_data()); diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 8f6f907268..44a2d55f8e 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -88,15 +88,14 @@ void Gemm(cudaStream_t stream, void **buffers, const char *opaque, size_t opaque desc.fuse_bias, desc.grad, desc.accumulate, desc.use_split_accumulator); } -Error_Type GemmFFI( - cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, - Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, Buffer_Type out, - Buffer_Type out_amax, Buffer_Type out_scale, Buffer_Type dummy_in, Result_Type out_updated, - Result_Type out_amax_updated, Result_Type out_scale_updated, Result_Type pre_gelu_out, - Result_Type bias_grad, Result_Type dummy_out, Result_Type workspace, bool lhs_trans, - bool rhs_trans, bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, - bool use_split_accumulator -) { +Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_inv, Buffer_Type rhs, + Buffer_Type rhs_scale_inv, Buffer_Type bias, Buffer_Type gelu_input, + Buffer_Type out, Buffer_Type out_amax, Buffer_Type out_scale, + Buffer_Type dummy_in, Result_Type out_updated, Result_Type out_amax_updated, + Result_Type out_scale_updated, Result_Type pre_gelu_out, Result_Type bias_grad, + Result_Type 
dummy_out, Result_Type workspace, bool lhs_trans, bool rhs_trans, + bool fuse_gelu, bool fuse_bias, bool grad, bool accumulate, + bool use_split_accumulator) { // Inputs auto lhs_ptr = lhs.untyped_data(); auto lhs_scale_inv_ptr = reinterpret_cast(lhs_scale_inv.untyped_data()); diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 9b9afd56ca..37d6e5328b 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -176,9 +176,11 @@ def _gemm_fwd_rule( ) final_out = out - if (comm_overlap_config is not None + if ( + comm_overlap_config is not None and comm_overlap_config["method"] != "bulk" - and comm_overlap_config["comm_type"] == tex.CommOverlapType.RS): + and comm_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): # Non-bulk RS overlap output is in extra output, not usual output final_out = extra_out @@ -213,8 +215,10 @@ def _gemm_bwd_rule( if comm_overlap_config is not None: dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) - if (dgrad_overlap_config["method"] == "bulk" - and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.AG): + if ( + dgrad_overlap_config["method"] == "bulk" + and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.AG + ): # If DGRAD is bulk overlap, copy input X into comm buffer to be all-gathered in # preparation for WGRAD. wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" @@ -260,9 +264,11 @@ def _gemm_bwd_rule( comm_overlap_config=dgrad_overlap_config, ) - if (dgrad_overlap_config is not None + if ( + dgrad_overlap_config is not None and dgrad_overlap_config["method"] != "bulk" - and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS): + and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): # Otherwise, if DGRAD overlap is RS overlap, DGRAD output is the extra output tensor dgrad = dgrad_extra_out @@ -971,9 +977,11 @@ def add_new_comm_gemm_overlap( # Check if both AG and RS overlaps use `atomic GEMM`` + `p2p ring-exchange`. # Using atomic GEMM + p2p ring-exchange in only one of the pair breaks functionality. global layers_atomic_ring_exchange - if (overlap_atomic_gemm + if ( + overlap_atomic_gemm and overlap_method == "ring_exchange" - and overlap_name in ag_rs_pairs): + and overlap_name in ag_rs_pairs + ): layers_atomic_ring_exchange += [overlap_name, ag_rs_pairs[overlap_name]] if overlap_name in rs_ag_pairs: assert_message = ( @@ -987,9 +995,7 @@ def add_new_comm_gemm_overlap( assert overlap_atomic_gemm and overlap_method == "ring_exchange", assert_message else: if overlap_atomic_gemm and overlap_method == "ring_exchange": - assert ( - rs_ag_pairs[overlap_name] in layers_atomic_ring_exchange - ), assert_message + assert rs_ag_pairs[overlap_name] in layers_atomic_ring_exchange, assert_message # Reduce buffer shape to 2D here in case the user initialized with batch dims buffer_shape = (reduce(operator.mul, shape[:-1], 1), shape[-1]) @@ -1052,7 +1058,7 @@ def destroy_comm_gemm_overlaps(): def get_comm_overlap_config(name): global _ACTIVE_COMM_GEMM_OVERLAPS - assert name in _ACTIVE_COMM_GEMM_OVERLAPS, ( - f"Comm+GEMM overlap for '{name}' has not been initialized!" - ) + assert ( + name in _ACTIVE_COMM_GEMM_OVERLAPS + ), f"Comm+GEMM overlap for '{name}' has not been initialized!" 
return _ACTIVE_COMM_GEMM_OVERLAPS[name] From a569e3b44b6e9f26823ceeb5608e0bd820c20841 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 3 Dec 2024 14:35:13 +0000 Subject: [PATCH 23/32] added comm+GEMM overlap example script Signed-off-by: Alp Dener --- .../comm_gemm_overlap/comm_gemm_overlap.py | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 examples/jax/comm_gemm_overlap/comm_gemm_overlap.py diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py new file mode 100644 index 0000000000..eb8d09d7f6 --- /dev/null +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Comm+GEMM Overlap with TE/JAX""" + +import argparse + +from mpi4py import MPI + +import jax +import jax.numpy as jnp +from jax.sharding import Mesh, NamedSharding, PartitionSpec +from jax.experimental import mesh_utils +import numpy as np +import transformer_engine.jax as te +from transformer_engine.jax.cpp_extensions import gemm_impl +from transformer_engine.jax.gemm import ( + initialize_comm_gemm_overlaps, + destroy_comm_gemm_overlaps, + get_comm_overlap_config, +) + +jax.clear_caches() + +# This script needs to be launched via `mpirun` with 1 process per GPU +myrank = MPI.COMM_WORLD.Get_rank() +numranks = MPI.COMM_WORLD.Get_size() +jax.distributed.initialize(cluster_detection_method='mpi4py') + +parser = argparse.ArgumentParser() +parser.add_argument('-dp', '--dp-size', type=int, default=1) +parser.add_argument('-zp', '--fsdp-size', type=int, default=2) +parser.add_argument('-tp', '--tp-size', type=int, default=4) +parser.add_argument('-np', '--num-gpus', type=int, default=8) +parser.add_argument('--base-size', type=int, default=16) +parser.add_argument('--batch-size', type=int, default=4) +parser.add_argument('--no-batch', action="store_true") +parser.add_argument('--no-fsdp', action="store_true") +parser.add_argument('--comm-type', type=str.upper, default="AG", choices=["AG", "RS"]) +args = parser.parse_args() + +# GEMM problem sizing +dtype = jnp.bfloat16 +seq_length = args.base_size * 8 +hidden_size = args.base_size * 6 +ffn_hidden_size = args.base_size * 16 + +# Operand shapes +lhs_shape = ( + [seq_length, hidden_size] + if args.comm_type == "AG" + else [seq_length, ffn_hidden_size] +) +rhs_shape = ( + [hidden_size, ffn_hidden_size] + if args.comm_type == "AG" + else [ffn_hidden_size, hidden_size] +) + +# Operand partitioning +batched = not args.no_batch +fsdp = not args.no_fsdp +if batched: + lhs_shape = [args.batch_size] + lhs_shape + if fsdp: + mesh_shape = {'dp': args.dp_size, 'zp': args.fsdp_size, 'tp': args.tp_size} + mesh_resource = te.MeshResource(dp_resource='dp', tp_resource='tp', cp_resource='tp', + fsdp_resource='zp') + if args.comm_type == "AG": + input_specs = [('dp', 'zp'), 'tp', None] + weight_specs = ['zp', 'tp'] + weight_no_fsdp = [None, 'tp'] + elif args.comm_type == "RS": + input_specs = [('dp', 'zp'), None, 'tp'] + weight_specs = ['tp', 'zp'] + weight_no_fsdp = ['tp', None] + else: + mesh_shape = {'dp': args.dp_size, 'tp': args.tp_size} + mesh_resource = te.MeshResource(dp_resource='dp', tp_resource='tp', cp_resource='tp',) + if args.comm_type == "AG": + input_specs = ['dp', 'tp', None] + weight_specs = [None, 'tp'] + elif args.comm_type == "RS": + input_specs = ['dp', None, 'tp'] + weight_specs = ['tp', None] + weight_no_fsdp = weight_specs +else: + 
mesh_shape = {'tp': args.tp_size} + mesh_resource = te.MeshResource(tp_resource='tp', cp_resource='cp') + if args.comm_type == "AG": + input_specs = ['tp', None] + weight_specs = [None, 'tp'] + elif args.comm_type == "RS": + input_specs = [None, 'tp'] + weight_specs = ['tp', None] + weight_no_fsdp = weight_specs + +# Mesh setup and sharding definitions +devices = mesh_utils.create_device_mesh((args.num_gpus, ), devices=jax.devices()[:args.num_gpus]) +mesh = Mesh(np.array(devices).reshape(tuple(mesh_shape.values())), tuple(mesh_shape.keys())) +input_sharding = NamedSharding(mesh, PartitionSpec(*input_specs)) +weight_sharding = NamedSharding(mesh, PartitionSpec(*weight_specs)) +weight_no_fsdp_sharding = NamedSharding(mesh, PartitionSpec(*weight_no_fsdp)) + +# Operand initialization +key = jax.random.PRNGKey(0) +key1, key2 = jax.random.split(key, 2) +lhs = jax.device_put(jax.random.normal(key1, lhs_shape, dtype=dtype), input_sharding) +rhs = jax.device_put(jax.random.normal(key2, rhs_shape, dtype=dtype), weight_sharding) + +# Name of comm+GEMM overlap layer +overlap_name = "ag_gemm" if args.comm_type == "AG" else "gemm_rs" + +# Bootstrap Userbuffers communicators and communication buffers +initialize_comm_gemm_overlaps( + lhs_shape, + mesh, + myrank, + numranks, + tp_resource='tp', + overlap_configs={overlap_name : dict()}, +) + +if myrank == 0: + print( + f"{myrank}: INPUTS {lhs.shape} x {rhs.shape}\n" + + f"{myrank}: LHS sharding: {lhs.sharding}\n" + + f"{myrank}: RHS sharding: {rhs.sharding}\n", + flush=True + ) + +@jax.jit +def te_gemm(A, B): + return gemm_impl(A, jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), + batched_output=True, + comm_overlap_config=get_comm_overlap_config(overlap_name)) + +with te.sharding.global_shard_guard(mesh_resource): + output, _, extra_out = te_gemm(lhs, rhs) + +if myrank == 0: + print( + f"{myrank}: {'AG -> GEMM' if args.comm_type == 'AG' else 'GEMM -> RS'} OUTPUTS:\n" + + f"{myrank}: GEMM output: {output.shape} | {output.sharding}\n" + + f"{myrank}: {'Gathered LHS' if args.comm_type == 'AG' else 'Scattered output:'}: " + + f"{extra_out.shape} | {extra_out.sharding}\n", + flush=True + ) + +destroy_comm_gemm_overlaps() + + From 69db12ea42e0e3729b42ba874cae91aaf52f4b1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:35:44 +0000 Subject: [PATCH 24/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../comm_gemm_overlap/comm_gemm_overlap.py | 100 +++++++++--------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index eb8d09d7f6..551fdaa0b4 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -25,18 +25,18 @@ # This script needs to be launched via `mpirun` with 1 process per GPU myrank = MPI.COMM_WORLD.Get_rank() numranks = MPI.COMM_WORLD.Get_size() -jax.distributed.initialize(cluster_detection_method='mpi4py') +jax.distributed.initialize(cluster_detection_method="mpi4py") parser = argparse.ArgumentParser() -parser.add_argument('-dp', '--dp-size', type=int, default=1) -parser.add_argument('-zp', '--fsdp-size', type=int, default=2) -parser.add_argument('-tp', '--tp-size', type=int, default=4) -parser.add_argument('-np', '--num-gpus', type=int, default=8) -parser.add_argument('--base-size', type=int, default=16) 
-parser.add_argument('--batch-size', type=int, default=4) -parser.add_argument('--no-batch', action="store_true") -parser.add_argument('--no-fsdp', action="store_true") -parser.add_argument('--comm-type', type=str.upper, default="AG", choices=["AG", "RS"]) +parser.add_argument("-dp", "--dp-size", type=int, default=1) +parser.add_argument("-zp", "--fsdp-size", type=int, default=2) +parser.add_argument("-tp", "--tp-size", type=int, default=4) +parser.add_argument("-np", "--num-gpus", type=int, default=8) +parser.add_argument("--base-size", type=int, default=16) +parser.add_argument("--batch-size", type=int, default=4) +parser.add_argument("--no-batch", action="store_true") +parser.add_argument("--no-fsdp", action="store_true") +parser.add_argument("--comm-type", type=str.upper, default="AG", choices=["AG", "RS"]) args = parser.parse_args() # GEMM problem sizing @@ -46,15 +46,9 @@ ffn_hidden_size = args.base_size * 16 # Operand shapes -lhs_shape = ( - [seq_length, hidden_size] - if args.comm_type == "AG" - else [seq_length, ffn_hidden_size] -) +lhs_shape = [seq_length, hidden_size] if args.comm_type == "AG" else [seq_length, ffn_hidden_size] rhs_shape = ( - [hidden_size, ffn_hidden_size] - if args.comm_type == "AG" - else [ffn_hidden_size, hidden_size] + [hidden_size, ffn_hidden_size] if args.comm_type == "AG" else [ffn_hidden_size, hidden_size] ) # Operand partitioning @@ -63,40 +57,45 @@ if batched: lhs_shape = [args.batch_size] + lhs_shape if fsdp: - mesh_shape = {'dp': args.dp_size, 'zp': args.fsdp_size, 'tp': args.tp_size} - mesh_resource = te.MeshResource(dp_resource='dp', tp_resource='tp', cp_resource='tp', - fsdp_resource='zp') + mesh_shape = {"dp": args.dp_size, "zp": args.fsdp_size, "tp": args.tp_size} + mesh_resource = te.MeshResource( + dp_resource="dp", tp_resource="tp", cp_resource="tp", fsdp_resource="zp" + ) if args.comm_type == "AG": - input_specs = [('dp', 'zp'), 'tp', None] - weight_specs = ['zp', 'tp'] - weight_no_fsdp = [None, 'tp'] + input_specs = [("dp", "zp"), "tp", None] + weight_specs = ["zp", "tp"] + weight_no_fsdp = [None, "tp"] elif args.comm_type == "RS": - input_specs = [('dp', 'zp'), None, 'tp'] - weight_specs = ['tp', 'zp'] - weight_no_fsdp = ['tp', None] + input_specs = [("dp", "zp"), None, "tp"] + weight_specs = ["tp", "zp"] + weight_no_fsdp = ["tp", None] else: - mesh_shape = {'dp': args.dp_size, 'tp': args.tp_size} - mesh_resource = te.MeshResource(dp_resource='dp', tp_resource='tp', cp_resource='tp',) + mesh_shape = {"dp": args.dp_size, "tp": args.tp_size} + mesh_resource = te.MeshResource( + dp_resource="dp", + tp_resource="tp", + cp_resource="tp", + ) if args.comm_type == "AG": - input_specs = ['dp', 'tp', None] - weight_specs = [None, 'tp'] + input_specs = ["dp", "tp", None] + weight_specs = [None, "tp"] elif args.comm_type == "RS": - input_specs = ['dp', None, 'tp'] - weight_specs = ['tp', None] + input_specs = ["dp", None, "tp"] + weight_specs = ["tp", None] weight_no_fsdp = weight_specs else: - mesh_shape = {'tp': args.tp_size} - mesh_resource = te.MeshResource(tp_resource='tp', cp_resource='cp') + mesh_shape = {"tp": args.tp_size} + mesh_resource = te.MeshResource(tp_resource="tp", cp_resource="cp") if args.comm_type == "AG": - input_specs = ['tp', None] - weight_specs = [None, 'tp'] + input_specs = ["tp", None] + weight_specs = [None, "tp"] elif args.comm_type == "RS": - input_specs = [None, 'tp'] - weight_specs = ['tp', None] + input_specs = [None, "tp"] + weight_specs = ["tp", None] weight_no_fsdp = weight_specs # Mesh setup and sharding 
definitions -devices = mesh_utils.create_device_mesh((args.num_gpus, ), devices=jax.devices()[:args.num_gpus]) +devices = mesh_utils.create_device_mesh((args.num_gpus,), devices=jax.devices()[: args.num_gpus]) mesh = Mesh(np.array(devices).reshape(tuple(mesh_shape.values())), tuple(mesh_shape.keys())) input_sharding = NamedSharding(mesh, PartitionSpec(*input_specs)) weight_sharding = NamedSharding(mesh, PartitionSpec(*weight_specs)) @@ -117,8 +116,8 @@ mesh, myrank, numranks, - tp_resource='tp', - overlap_configs={overlap_name : dict()}, + tp_resource="tp", + overlap_configs={overlap_name: dict()}, ) if myrank == 0: @@ -126,14 +125,19 @@ f"{myrank}: INPUTS {lhs.shape} x {rhs.shape}\n" + f"{myrank}: LHS sharding: {lhs.sharding}\n" + f"{myrank}: RHS sharding: {rhs.sharding}\n", - flush=True + flush=True, ) + @jax.jit def te_gemm(A, B): - return gemm_impl(A, jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), - batched_output=True, - comm_overlap_config=get_comm_overlap_config(overlap_name)) + return gemm_impl( + A, + jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), + batched_output=True, + comm_overlap_config=get_comm_overlap_config(overlap_name), + ) + with te.sharding.global_shard_guard(mesh_resource): output, _, extra_out = te_gemm(lhs, rhs) @@ -144,9 +148,7 @@ def te_gemm(A, B): + f"{myrank}: GEMM output: {output.shape} | {output.sharding}\n" + f"{myrank}: {'Gathered LHS' if args.comm_type == 'AG' else 'Scattered output:'}: " + f"{extra_out.shape} | {extra_out.sharding}\n", - flush=True + flush=True, ) destroy_comm_gemm_overlaps() - - From ec2d5aecd45926959e90a5cd6a20bb2202beecdd Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 3 Dec 2024 14:38:57 +0000 Subject: [PATCH 25/32] RS overlap also works Signed-off-by: Alp Dener --- examples/jax/comm_gemm_overlap/comm_gemm_overlap.py | 3 ++- transformer_engine/jax/gemm.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index 551fdaa0b4..3637abbd50 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -4,6 +4,7 @@ """Comm+GEMM Overlap with TE/JAX""" import argparse +import numpy as np from mpi4py import MPI @@ -11,7 +12,7 @@ import jax.numpy as jnp from jax.sharding import Mesh, NamedSharding, PartitionSpec from jax.experimental import mesh_utils -import numpy as np + import transformer_engine.jax as te from transformer_engine.jax.cpp_extensions import gemm_impl from transformer_engine.jax.gemm import ( diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 37d6e5328b..59d1045080 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -932,7 +932,9 @@ def get_default_config(name): "name": name, "method": method, "comm_type": ( - tex.CommOverlapType.AG if name in layers_all_gather_overlap else tex.CommOverlap.RS + tex.CommOverlapType.AG + if name in layers_all_gather_overlap + else tex.CommOverlapType.RS ), "num_sm": 1 if method == "ring_exchange" else 16, "num_max_streams": _NUM_MAX_UB_STREAMS, From 8fe3942635c311b80de2910cebd906b250236b90 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Tue, 3 Dec 2024 14:55:18 +0000 Subject: [PATCH 26/32] added missing copy of AG+GEMM input into comm buffer Signed-off-by: Alp Dener --- .../jax/comm_gemm_overlap/comm_gemm_overlap.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index 3637abbd50..8920b1a37d 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -14,7 +14,7 @@ from jax.experimental import mesh_utils import transformer_engine.jax as te -from transformer_engine.jax.cpp_extensions import gemm_impl +from transformer_engine.jax.cpp_extensions import gemm_impl, copy_into_overlap_buffer from transformer_engine.jax.gemm import ( initialize_comm_gemm_overlaps, destroy_comm_gemm_overlaps, @@ -124,14 +124,15 @@ if myrank == 0: print( f"{myrank}: INPUTS {lhs.shape} x {rhs.shape}\n" - + f"{myrank}: LHS sharding: {lhs.sharding}\n" - + f"{myrank}: RHS sharding: {rhs.sharding}\n", + + f"{myrank}: LHS sharding: {lhs.sharding.spec}\n" + + f"{myrank}: RHS sharding: {rhs.sharding.spec}\n", flush=True, ) @jax.jit def te_gemm(A, B): + copy_into_overlap_buffer(A, overlap_name, True) return gemm_impl( A, jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), @@ -145,10 +146,9 @@ def te_gemm(A, B): if myrank == 0: print( - f"{myrank}: {'AG -> GEMM' if args.comm_type == 'AG' else 'GEMM -> RS'} OUTPUTS:\n" - + f"{myrank}: GEMM output: {output.shape} | {output.sharding}\n" - + f"{myrank}: {'Gathered LHS' if args.comm_type == 'AG' else 'Scattered output:'}: " - + f"{extra_out.shape} | {extra_out.sharding}\n", + f"{myrank}: {'AG -> GEMM' if args.comm_type == 'AG' else 'GEMM -> RS'} OUTPUT " + + f"{output.shape}\n" + + f"{myrank}: Sharding: {output.sharding.spec}\n", flush=True, ) From adf4046f7fd92e29ec2834778821acec177fe949 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 5 Dec 2024 19:54:23 +0000 Subject: [PATCH 27/32] updated FWD/BWD wrappers for non-FP8 and FP8 gemm Signed-off-by: Alp Dener --- transformer_engine/jax/cpp_extensions/gemm.py | 1 + transformer_engine/jax/gemm.py | 224 +++++++++--------- 2 files changed, 108 insertions(+), 117 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index b43c644a51..66eea09cb2 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1074,6 +1074,7 @@ def fp8_gemm_impl( bias: Optional[ArrayLike] = None, gelu_input: Optional[ArrayLike] = None, out: Optional[ArrayLike] = None, + extra_out: Optional[ArrayLike] = None, out_amax: Optional[ArrayLike] = None, out_scale: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 59d1045080..1a275ceed7 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -209,27 +209,30 @@ def _gemm_bwd_rule( mirror_dim, (x_inner_dim, kernel_inner_dim), (x.ndim, kernel.ndim) ) + # Recover DGRAD and WGRAD comm+GEMM overlap configs + dgrad_overlap_name = None dgrad_overlap_config = None + wgrad_overlap_name = None wgrad_overlap_config = None - dgrad_pre_rs = None if comm_overlap_config is not None: dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) - if ( - dgrad_overlap_config["method"] == "bulk" - and dgrad_overlap_config["comm_type"] == tex.CommOverlapType.AG - ): - # If DGRAD is bulk overlap, copy input X into comm buffer to be all-gathered in - # preparation for WGRAD. 
- wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" - wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) - assert wgrad_overlap_config is not None, "Internal TE error!" - copy_into_overlap_buffer(x, dgrad_overlap_name, True) + wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" + wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + dgrad_pre_rs = None + if dgrad_overlap_config is not None: + if dgrad_overlap_config["method"] == "bulk": # Set DGRAD output buffer to the comm buffer of WGRAD GEMM in order to do the - # bulk RS overlap without an extra memcpy + # bulk RS overlap without an extra memcpy. + assert wgrad_overlap_config is not None, ( + f"Missing comm+GEMM overlap config for {wgrad_overlap_name}!" + ) dgrad_pre_rs = tex.get_overlap_buffer(wgrad_overlap_name, False) + # Copy transposed input into the DGRAD overlap buffer for bulk AG. + copy_into_overlap_buffer(jnp.matrix_transpose(x), dgrad_overlap_name, True) + # FWD MODE: # AG+GEMM: ([B], M/P, K) --(AG)--> ([B], M, K) x (K, N/P) ------> ([B], M, N/P) # @@ -246,7 +249,7 @@ def _gemm_bwd_rule( # AG+GEMM w/ DGRAD+RS Overlap: ([B], M, N/P) x (K, N/P)^T ---(RS)---> ([B], M/P, K) # # AG+GEMM w/ Bulk AG Overlap: ([B], M, N/P) x (K, N/P)^T -----> ([B], M, K) (deferred RS) - # ([B], M, K/P) --(Bulk AG)--> ([B], M, K) (needed in WGRAD) + # ([B], M, K/P)^T --(Bulk AG)--> ([B], M, K)^T (needed in WGRAD) # # GEMM+RS: ([B], M/P, N) --(AG)--> ([B], M, N) x (K/P, N)^T ----> ([B], M, K/P) dgrad, dgelu, _, dgrad_extra_out = gemm_impl( @@ -272,13 +275,14 @@ def _gemm_bwd_rule( # Otherwise, if DGRAD overlap is RS overlap, DGRAD output is the extra output tensor dgrad = dgrad_extra_out + # WGRAD w/o Overlap: # AG+GEMM: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) # # GEMM+AR: ([B], M, K/P)^T --(AG)--> ([B], M, K)^T x ([B], M, N) ---------> (K, N) # # WGRAD w/ Overlap: - # AG+GEMM w/ DGRAD+RS Overlap: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) + # AG+GEMM w/ DGRAD+RS Overlap: ([B], M, K/P)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) # # AG+GEMM w/ Bulk Overlaps: ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) # ([B], M, K) --(Bulk RS)--> ([B], M/P, K) (finalize DGRAD) @@ -299,7 +303,11 @@ def _gemm_bwd_rule( comm_overlap_config=wgrad_overlap_config, ) - if wgrad_overlap_config is not None: + if ( + wgrad_overlap_config is not None + and wgrad_overlap_config["method"] == "bulk" + and wgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS + ): # DGRAD was reduce-scattered during WGRAD GEMM, so set DGRAD to WGRAD extra output here dgrad = wgrad_extra_out @@ -317,6 +325,7 @@ def fp8_gemm( kernel_t: ArrayLike, fp8_meta: FP8MetaPackage, bias: Optional[ArrayLike] = None, + out: Optional[ArrayLike] = None, out_dtype: jnp.dtype = jnp.bfloat16, fuse_gelu: bool = False, accumulate: bool = False, @@ -340,10 +349,12 @@ def fp8_gemm( FP8MetaPackage object carrying amax, scale and scale_inv information for the GEMM operands. bias : Optional[ArrayLike], default = `None` Optional bias term to add onto the (LHS x RHS) result. + out: Optional[ArrayLike], default = `None` + Optional empty buffer for FP8 GEMM output. out_dtype : jnp.dtype, default = `jnp.bfloat16` Data type of the FP8 GEMM output. If chosen as an FP8 dtype (i.e. `jnp.float8_e4m3fn` or `jnp.float8_e5m2`), the `fp8_meta` must also contain amax and scale information for the - GEMM output. + GEMM output. 
This option is overridden by the data type of the `out` buffer, if given. fuse_gelu : bool, default = `False` Enable the GELU epilogue for GEMM. This applies GELU after the bias-addition if the bias term is not `None`. @@ -389,13 +400,14 @@ def fp8_gemm( ) -@partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9)) +@partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9, 10)) def _fp8_gemm( x: ArrayLike, kernel_t: ArrayLike, bias: ArrayLike, amax_list: ArrayLike, scale_list: ArrayLike, + out: ArrayLike, out_dtype: jnp.dtype, fuse_gelu: bool, accumulate: bool, @@ -501,14 +513,14 @@ def _fp8_gemm_fwd_rule( buffer_scale_inv = None if comm_overlap_config is not None: overlap_name = comm_overlap_config["name"] - if comm_overlap_config["method"] != "bulk" and tex.overlap_buffer_is_fp8(overlap_name): - match comm_overlap_config["comm_type"]: - case tex.CommOverlapType.AG: - buffer_scale_inv = x_scale_inv + if comm_overlap_config["comm_type"] == tex.CommOverlapType.AG: + buffer_scale_inv = x_scale_inv - case tex.CommOverlapType.RS: - buffer_scale_inv = jnp.reciprocal(out_scale) + elif comm_overlap_config["comm_type"] == tex.CommOverlapType.RS: + out_dtype = fwd_dtype + out_scale = scale_list[FP8MetaPackage.OUTPUT_IDX][0:1] + buffer_scale_inv = jnp.reciprocal(out_scale) tex.set_overlap_buffer_scale_inverse( overlap_name, @@ -531,9 +543,6 @@ def _fp8_gemm_fwd_rule( use_split_accumulator=use_split_accumulator, comm_overlap_config=comm_overlap_config, ) - if not jax_dtype_is_fp8(out_dtype): - updated_out_amax = None - updated_out_scale = None # Update returned and saved arrays based on comm+GEMM overlap config final_out = out @@ -542,6 +551,10 @@ def _fp8_gemm_fwd_rule( # RS overlap puts the reduce-scattered sharded output into extra_out final_out = extra_out + if not jax_dtype_is_fp8(final_out): + updated_out_amax = None + updated_out_scale = None + ctx = ( casted_x_t, casted_kernel, @@ -583,9 +596,21 @@ def _fp8_gemm_bwd_rule( maybe_fp32_to_fm32, batched_input, ) = ctx - + del out_dtype bwd_dtype = FP8Helper.BWD_DTYPE + # Recover DGRAD and WGRAD comm+GEMM overlap configs + dgrad_overlap_name = None + dgrad_overlap_config = None + wgrad_overlap_name = None + wgrad_overlap_config = None + if comm_overlap_config is not None: + dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" + dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) + wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" + wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + + # Cast-transpose grad with potential fusions grad_amax = amax_list[FP8MetaPackage.GRAD_IDX][0:1] grad_scale = scale_list[FP8MetaPackage.GRAD_IDX] grad_scale_inv = scale_inv_list[FP8MetaPackage.GRAD_ID] @@ -633,28 +658,29 @@ def _fp8_gemm_bwd_rule( ) bgrad = None - # Recover dgrad comm+GEMM overlap config - dgrad_overlap_config = None - if comm_overlap_config is not None: - dgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_dgrad" - dgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(dgrad_overlap_name, None) - # Set scale_inv for comm overlap buffer - dgrad_out_dtype = jnp.bfloat16 dgrad_amax = None dgrad_scale = None - if ( - dgrad_overlap_config is not None - and dgrad_overlap_config["method"] != "bulk" - and tex.overlap_buffer_is_fp8(dgrad_overlap_name) - ): - dgrad_out_dtype = bwd_dtype - dgrad_amax = grad_amax - dgrad_scale = grad_scale - tex.set_overlap_buffer_scale_inverse( - dgrad_overlap_name, - jax.dlpack.to_dlpack(grad_scale_inv), 
- ) + if dgrad_overlap_config is not None: + if dgrad_overlap_config["method"] == "bulk": + assert wgrad_overlap_config is not None, ( + f"Missing comm+GEMM overlap config for {wgrad_overlap_name}!" + ) + # Set WGRAD buffer as output of DGRAD in order to avoid a memcpy for bulk RS overlap + dgrad_pre_rs = jax.dlpack.from_dlpack( + tex.get_overlap_buffer(wgrad_overlap_name, False) + ) + # Copy input into overlap buffer for all-gather + copy_into_overlap_buffer(casted_x_t, dgrad_overlap_name, True) + + elif tex.overlap_buffer_is_fp8(dgrad_overlap_name): + # Non-bulk RS DGRAD overlap needs output amax and scale if buffer type is FP8 + dgrad_amax = grad_amax + dgrad_scale = grad_scale + tex.set_overlap_buffer_scale_inverse( + dgrad_overlap_name, + jax.dlpack.to_dlpack(grad_scale_inv), + ) # DGRAD: ([B], M, N) x (K, N)^T = ([B], M, K) kernel_scale_inv = scale_inv_list[FP8MetaPackage.WEIGHT_IDX] @@ -663,11 +689,9 @@ def _fp8_gemm_bwd_rule( grad_scale_inv, casted_kernel, kernel_scale_inv, - None, - None, - dgrad_amax, - dgrad_scale, - out_dtype=dgrad_out_dtype, + out=dgrad_pre_rs, + out_amax=dgrad_amax, + out_scale=dgrad_scale, batched_output=batched_input, accumulate=accumulate, use_split_accumulator=use_split_accumulator, @@ -682,65 +706,29 @@ def _fp8_gemm_bwd_rule( ): dgrad = dgrad_extra_out - if fuse_gelu and fuse_bias: - # Fuse bgrad with dGELU. - _, casted_dgelu_t, bgrad, updated_grad_amax = dact_lu_dbias_cast_transpose( - grad, - pre_gelu_out, - grad_amax, - grad_scale, - grad_scale_inv, - bwd_dtype, - static_axis_boundary=-1, - transpose_axis_boundary=-1, - activation_type=("gelu",), - ) - elif fuse_gelu: - # No bias grad to fuse so we just do dGELU. - _, casted_dgelu_t, updated_grad_amax = dact_lu(grad, pre_gelu_out, ("gelu",)) - bgrad = None - - # Recover wgrad config - wgrad_overlap_config = None - if comm_overlap_config is not None: - wgrad_overlap_name = comm_overlap_config["name"].rstrip("_fprop") + "_wgrad" - wgrad_overlap_config = _ACTIVE_COMM_GEMM_OVERLAPS.get(wgrad_overlap_name, None) + # Prepare comm+GEMM overlap for WGRAD + if wgrad_overlap_config is not None: + if wgrad_overlap_config["method"] == "bulk": + # Get all-gathered input from DGRAD bulk overlap + casted_x_t = jax.dlpack.from_dlpack( + tex.get_overlap_buffer(dgrad_overlap_name, False) + ) - # Set scale_inv for comm overlap buffer - wgrad_out_dtype = jnp.bfloat16 - wgrad_amax = None - wgrad_scale = None - if ( - wgrad_overlap_config is not None - and wgrad_overlap_config["method"] != "bulk" - and tex.overlap_buffer_is_fp8(wgrad_overlap_name) - ): - match wgrad_overlap_config["comm_type"]: - case tex.CommOverlapType.AG: - buffer_scale_inv = x_scale_inv - case tex.CommOverlapType.RS: - buffer_scale_inv = grad_scale_inv - wgrad_out_dtype = bwd_dtype - wgrad_amax = grad_amax - wgrad_scale = grad_scale - tex.set_overlap_buffer_scale_inverse( - dgrad_overlap_name, - jax.dlpack.to_dlpack(buffer_scale_inv), - ) + elif tex.overlap_buffer_is_fp8(wgrad_overlap_name): + # Set FP8 scale inverse for non-bulk AG overlap + tex.set_overlap_buffer_scale_inverse( + wgrad_overlap_name, + jax.dlpack.to_dlpack(x_scale_inv) + ) # WGRAD: ([B], N, M) x ([B], K, M)^T = (N, K) - wgrad_rhs_t = casted_dgelu_t if fuse_gelu else casted_grad_t x_scale_inv = scale_inv_list[FP8MetaPackage.INPUT_IDX] wgrad, *_, wgrad_extra_out = fp8_gemm_impl( casted_x_t, x_scale_inv, - wgrad_rhs_t, + casted_grad_t, grad_scale_inv, - None, - None, - wgrad_amax, - wgrad_scale, - out_dtype=wgrad_out_dtype, + out_dtype=jnp.bfloat16, batched_output=False, 
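For reference, a minimal sketch of the delayed-scaling bookkeeping that the overlap-buffer seeding above relies on, assuming the usual TE recipe of scale = fp8_max / amax with the scaling margin omitted; the buffer is handed the reciprocal so the communicated FP8 values can be dequantized:

    import jax.numpy as jnp

    amax = jnp.float32(12.5)           # running max of |x| tracked in the FP8 meta package
    fp8_max = jnp.float32(448.0)       # largest representable E4M3 value
    scale = fp8_max / amax             # multiplies values into FP8 range before the cast
    scale_inv = jnp.reciprocal(scale)  # the quantity passed to set_overlap_buffer_scale_inverse()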
accumulate=accumulate, use_split_accumulator=use_split_accumulator, @@ -753,7 +741,7 @@ def _fp8_gemm_bwd_rule( and wgrad_overlap_config["method"] != "bulk" and wgrad_overlap_config["comm_type"] == tex.CommOverlapType.RS ): - wgrad = wgrad_extra_out + dgrad = wgrad_extra_out amax_list[FP8MetaPackage.INPUT_IDX] = ( amax_list[FP8MetaPackage.INPUT_IDX].at[0].set(updated_x_amax[0]) @@ -764,7 +752,7 @@ def _fp8_gemm_bwd_rule( amax_list[FP8MetaPackage.GRAD_IDX] = ( amax_list[FP8MetaPackage.GRAD_IDX].at[0].set(updated_grad_amax[0]) ) - if out_dtype in [jnp.float8_e4m3fn, jnp.float8_e5m2]: + if updated_out_amax is not None: amax_list[FP8MetaPackage.OUTPUT_IDX] = ( amax_list[FP8MetaPackage.OUTPUT_IDX].at[0].set(updated_out_amax[0]) ) @@ -782,8 +770,9 @@ def type_safe_gemm( x: ArrayLike, kernel: ArrayLike, bias: Optional[ArrayLike] = None, - fp8_meta: Optional[FP8MetaPackage] = None, + out: Optional[ArrayLike] = None, out_dtype: Optional[jnp.dtype] = None, + fp8_meta: Optional[FP8MetaPackage] = None, contracting_dims: Tuple[int, int] = (-1, -2), fuse_gelu: bool = False, accumulate: bool = False, @@ -802,24 +791,25 @@ def type_safe_gemm( return fp8_gemm( x, kernel, - bias, fp8_meta, - out_dtype, - fuse_gelu, - accumulate, - use_split_accumulator, - comm_overlap_name, + bias=bias, + out=out, + out_dtype=out_dtype, + fuse_gelu=fuse_gelu, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + comm_overlap_name=comm_overlap_name, ) else: return gemm( x, kernel, - bias, - contracting_dims, - fuse_gelu, - accumulate, - use_split_accumulator, - comm_overlap_name, + bias=bias, + contracting_dims=contracting_dims, + fuse_gelu=fuse_gelu, + accumulate=accumulate, + use_split_accumulator=use_split_accumulator, + comm_overlap_name=comm_overlap_name, ) From c4c608b54540c7cfc48f867e3e01bab15c4593cf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 19:54:48 +0000 Subject: [PATCH 28/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/gemm.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/transformer_engine/jax/gemm.py b/transformer_engine/jax/gemm.py index 1a275ceed7..7024dcb9fe 100644 --- a/transformer_engine/jax/gemm.py +++ b/transformer_engine/jax/gemm.py @@ -225,9 +225,9 @@ def _gemm_bwd_rule( if dgrad_overlap_config["method"] == "bulk": # Set DGRAD output buffer to the comm buffer of WGRAD GEMM in order to do the # bulk RS overlap without an extra memcpy. - assert wgrad_overlap_config is not None, ( - f"Missing comm+GEMM overlap config for {wgrad_overlap_name}!" - ) + assert ( + wgrad_overlap_config is not None + ), f"Missing comm+GEMM overlap config for {wgrad_overlap_name}!" dgrad_pre_rs = tex.get_overlap_buffer(wgrad_overlap_name, False) # Copy transposed input into the DGRAD overlap buffer for bulk AG. @@ -275,7 +275,6 @@ def _gemm_bwd_rule( # Otherwise, if DGRAD overlap is RS overlap, DGRAD output is the extra output tensor dgrad = dgrad_extra_out - # WGRAD w/o Overlap: # AG+GEMM: ([B], M/P, K)^T --(AG)--> ([B], M, K)^T x ([B], M, N/P) --> (K, N/P) # @@ -663,13 +662,11 @@ def _fp8_gemm_bwd_rule( dgrad_scale = None if dgrad_overlap_config is not None: if dgrad_overlap_config["method"] == "bulk": - assert wgrad_overlap_config is not None, ( - f"Missing comm+GEMM overlap config for {wgrad_overlap_name}!" 
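A hedged usage sketch of the non-FP8 path that `type_safe_gemm` dispatches to above; the keyword names follow the signatures in this patch and may still change before merge, and the shapes are arbitrary placeholders:

    import jax
    import jax.numpy as jnp
    from transformer_engine.jax.gemm import gemm

    x = jax.random.normal(jax.random.PRNGKey(0), (2, 64, 128), jnp.bfloat16)     # ([B], M, K)
    kernel = jax.random.normal(jax.random.PRNGKey(1), (128, 256), jnp.bfloat16)  # (K, N)
    # Contract the K dims of both operands; (-1, -2) mirrors the default in type_safe_gemm.
    out = gemm(x, kernel, bias=None, contracting_dims=(-1, -2), fuse_gelu=False)  # ([B], M, N)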
- ) + assert ( + wgrad_overlap_config is not None + ), f"Missing comm+GEMM overlap config for {wgrad_overlap_name}!" # Set WGRAD buffer as output of DGRAD in order to avoid a memcpy for bulk RS overlap - dgrad_pre_rs = jax.dlpack.from_dlpack( - tex.get_overlap_buffer(wgrad_overlap_name, False) - ) + dgrad_pre_rs = jax.dlpack.from_dlpack(tex.get_overlap_buffer(wgrad_overlap_name, False)) # Copy input into overlap buffer for all-gather copy_into_overlap_buffer(casted_x_t, dgrad_overlap_name, True) @@ -710,15 +707,12 @@ def _fp8_gemm_bwd_rule( if wgrad_overlap_config is not None: if wgrad_overlap_config["method"] == "bulk": # Get all-gathered input from DGRAD bulk overlap - casted_x_t = jax.dlpack.from_dlpack( - tex.get_overlap_buffer(dgrad_overlap_name, False) - ) + casted_x_t = jax.dlpack.from_dlpack(tex.get_overlap_buffer(dgrad_overlap_name, False)) elif tex.overlap_buffer_is_fp8(wgrad_overlap_name): # Set FP8 scale inverse for non-bulk AG overlap tex.set_overlap_buffer_scale_inverse( - wgrad_overlap_name, - jax.dlpack.to_dlpack(x_scale_inv) + wgrad_overlap_name, jax.dlpack.to_dlpack(x_scale_inv) ) # WGRAD: ([B], N, M) x ([B], K, M)^T = (N, K) From 4707df3bfaa8bc1c6a1de1caec1fa4ee19dc0902 Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Thu, 5 Dec 2024 22:41:27 +0000 Subject: [PATCH 29/32] added more documentation to the TE/JAX comm+GEMM overlap example Signed-off-by: Alp Dener --- .../comm_gemm_overlap/comm_gemm_overlap.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index 8920b1a37d..e3d72a9849 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -14,6 +14,7 @@ from jax.experimental import mesh_utils import transformer_engine.jax as te +from transformer_engine import transformer_engine_jax as tex from transformer_engine.jax.cpp_extensions import gemm_impl, copy_into_overlap_buffer from transformer_engine.jax.gemm import ( initialize_comm_gemm_overlaps, @@ -118,7 +119,18 @@ myrank, numranks, tp_resource="tp", - overlap_configs={overlap_name: dict()}, + overlap_configs={ + overlap_name: { + "method": "ring_exchange", # "pipeline" for collective kernels instead of send/recv + "comm_type": tex.CommOverlapType if args.comm_type == "AG" else tex.CommOverlapType.RS, + "num_splits": args.tp_size, # independent of TP size for "pipeline" + "cga_size": 1, # default is 2 for "pipeline" + "num_sm": 1, # ignored for "ring_exchange", must be tuned for "pipeline" + "set_sm_margin": False, # set to True for "pipeline" + "atomic_gemm": False, # more performant when not using CUDA Graphs + "use_ce": True, # ignored (always False) for "pipeline" method + } + }, ) if myrank == 0: @@ -132,11 +144,16 @@ @jax.jit def te_gemm(A, B): + # LHS needs to be copied into the comm. buffer before GEMM. This can usually be circumvented by + # extracting the comm. buffer as a JAX array via + # `buffer = jax.dlpack.from_dlpack(tex.get_overlap_buffer(overlap_name: str, sharded: bool))` + # and directly writing the result of a preceding operation into it (e.g.. 
LayerNorm output + # written directly into the communication buffer before AG+GEMM in a QKV projection) copy_into_overlap_buffer(A, overlap_name, True) return gemm_impl( A, - jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), - batched_output=True, + jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), # all-gather FSDP weights + batched_output=True, # internal option, will be hidden by the FWD/BWD wrapper comm_overlap_config=get_comm_overlap_config(overlap_name), ) @@ -144,6 +161,9 @@ def te_gemm(A, B): with te.sharding.global_shard_guard(mesh_resource): output, _, extra_out = te_gemm(lhs, rhs) +if args.comm_type == "RS": + output = extra_out + if myrank == 0: print( f"{myrank}: {'AG -> GEMM' if args.comm_type == 'AG' else 'GEMM -> RS'} OUTPUT " From 18a62496be25d9d0a0039c5bf3e2c5acaff671f1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 22:43:33 +0000 Subject: [PATCH 30/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jax/comm_gemm_overlap/comm_gemm_overlap.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index e3d72a9849..b968c4ef62 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -121,14 +121,14 @@ tp_resource="tp", overlap_configs={ overlap_name: { - "method": "ring_exchange", # "pipeline" for collective kernels instead of send/recv + "method": "ring_exchange", # "pipeline" for collective kernels instead of send/recv "comm_type": tex.CommOverlapType if args.comm_type == "AG" else tex.CommOverlapType.RS, - "num_splits": args.tp_size, # independent of TP size for "pipeline" - "cga_size": 1, # default is 2 for "pipeline" - "num_sm": 1, # ignored for "ring_exchange", must be tuned for "pipeline" - "set_sm_margin": False, # set to True for "pipeline" - "atomic_gemm": False, # more performant when not using CUDA Graphs - "use_ce": True, # ignored (always False) for "pipeline" method + "num_splits": args.tp_size, # independent of TP size for "pipeline" + "cga_size": 1, # default is 2 for "pipeline" + "num_sm": 1, # ignored for "ring_exchange", must be tuned for "pipeline" + "set_sm_margin": False, # set to True for "pipeline" + "atomic_gemm": False, # more performant when not using CUDA Graphs + "use_ce": True, # ignored (always False) for "pipeline" method } }, ) @@ -152,8 +152,8 @@ def te_gemm(A, B): copy_into_overlap_buffer(A, overlap_name, True) return gemm_impl( A, - jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), # all-gather FSDP weights - batched_output=True, # internal option, will be hidden by the FWD/BWD wrapper + jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), # all-gather FSDP weights + batched_output=True, # internal option, will be hidden by the FWD/BWD wrapper comm_overlap_config=get_comm_overlap_config(overlap_name), ) From b1449417044e132c7a80b13a6756581964f836df Mon Sep 17 00:00:00 2001 From: Alp Dener Date: Mon, 9 Dec 2024 16:48:41 +0000 Subject: [PATCH 31/32] fixed RS overlap in the example Signed-off-by: Alp Dener --- .../comm_gemm_overlap/comm_gemm_overlap.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index 
e3d72a9849..a283ca62a2 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -21,6 +21,7 @@ destroy_comm_gemm_overlaps, get_comm_overlap_config, ) +from transformer_engine.jax.sharding import get_padded_spec jax.clear_caches() @@ -122,7 +123,11 @@ overlap_configs={ overlap_name: { "method": "ring_exchange", # "pipeline" for collective kernels instead of send/recv - "comm_type": tex.CommOverlapType if args.comm_type == "AG" else tex.CommOverlapType.RS, + "comm_type": ( + tex.CommOverlapType.AG + if args.comm_type == "AG" + else tex.CommOverlapType.RS + ), "num_splits": args.tp_size, # independent of TP size for "pipeline" "cga_size": 1, # default is 2 for "pipeline" "num_sm": 1, # ignored for "ring_exchange", must be tuned for "pipeline" @@ -144,31 +149,34 @@ @jax.jit def te_gemm(A, B): - # LHS needs to be copied into the comm. buffer before GEMM. This can usually be circumvented by - # extracting the comm. buffer as a JAX array via + # For AG overlap, LHS needs to be copied into the comm. buffer before GEMM. This can usually + # be circumvented by extracting the comm. buffer as a JAX array via # `buffer = jax.dlpack.from_dlpack(tex.get_overlap_buffer(overlap_name: str, sharded: bool))` # and directly writing the result of a preceding operation into it (e.g.. LayerNorm output # written directly into the communication buffer before AG+GEMM in a QKV projection) - copy_into_overlap_buffer(A, overlap_name, True) + if args.comm_type == "AG": + copy_into_overlap_buffer(A, overlap_name, True) + return_idx = 0 + else: + # For RS overlap, the scattered output is in the `extra_out` array. + return_idx = -1 + return gemm_impl( A, jax.lax.with_sharding_constraint(B, weight_no_fsdp_sharding), # all-gather FSDP weights batched_output=True, # internal option, will be hidden by the FWD/BWD wrapper comm_overlap_config=get_comm_overlap_config(overlap_name), - ) + )[return_idx] with te.sharding.global_shard_guard(mesh_resource): - output, _, extra_out = te_gemm(lhs, rhs) - -if args.comm_type == "RS": - output = extra_out + output = te_gemm(lhs, rhs) if myrank == 0: print( f"{myrank}: {'AG -> GEMM' if args.comm_type == 'AG' else 'GEMM -> RS'} OUTPUT " + f"{output.shape}\n" - + f"{myrank}: Sharding: {output.sharding.spec}\n", + + f"{myrank}: Sharding: {get_padded_spec(output.sharding.spec, output.ndim)}\n", flush=True, ) From 6ad56517df0d19017c38efe9ad5374dfe000b948 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:50:20 +0000 Subject: [PATCH 32/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../comm_gemm_overlap/comm_gemm_overlap.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py index 0bfcbb5830..77266539e1 100644 --- a/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py +++ b/examples/jax/comm_gemm_overlap/comm_gemm_overlap.py @@ -122,20 +122,18 @@ tp_resource="tp", overlap_configs={ overlap_name: { - "method": "ring_exchange", # "pipeline" for collective kernels instead of send/recv + "method": "ring_exchange", # "pipeline" for collective kernels instead of send/recv "comm_type": ( - tex.CommOverlapType.AG - if args.comm_type == "AG" - else tex.CommOverlapType.RS + tex.CommOverlapType.AG if args.comm_type == "AG" else tex.CommOverlapType.RS ), - 
"num_splits": args.tp_size, # independent of TP size for "pipeline" - "cga_size": 1, # default is 2 for "pipeline" - "num_sm": 1, # ignored for "ring_exchange", must be tuned for "pipeline" - "set_sm_margin": False, # set to True for "pipeline" - "atomic_gemm": False, # more performant when not using CUDA Graphs - "use_ce": True, # ignored (always False) for "pipeline" method + "num_splits": args.tp_size, # independent of TP size for "pipeline" + "cga_size": 1, # default is 2 for "pipeline" + "num_sm": 1, # ignored for "ring_exchange", must be tuned for "pipeline" + "set_sm_margin": False, # set to True for "pipeline" + "atomic_gemm": False, # more performant when not using CUDA Graphs + "use_ce": True, # ignored (always False) for "pipeline" method }, - } + }, ) if myrank == 0: