
Commit 298d4fd

[Ascend]Adapt to Most Operators
1 parent d6c1e88 commit 298d4fd


62 files changed, +6589 -349 lines

src/flag_gems/runtime/backend/_ascend/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 vendor_info = VendorInfoBase(
     vendor_name="ascend",
     device_name="npu",
-    triton_extra_name="ascend",
     device_query_cmd="npu-smi info",
     dispatch_key="PrivateUse1",
 )

src/flag_gems/runtime/backend/_ascend/fused/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 from .cross_entropy_loss import cross_entropy_loss
+from .rotary_embedding import apply_rotary_pos_emb
+from .fused_add_rms_norm import fused_add_rms_norm
+from .skip_layernorm import skip_layer_norm

 __all__ = [
     "cross_entropy_loss",
+    "apply_rotary_pos_emb",
+    "fused_add_rms_norm",
+    "skip_layer_norm",
 ]
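
For reference, a quick sketch (not part of the commit) of what the updated export list provides, assuming the backend package is importable on the host:

import flag_gems.runtime.backend._ascend.fused as ascend_fused

# __all__ now carries the three newly wired ops next to cross_entropy_loss,
# matching the import lines added above.
print(sorted(ascend_fused.__all__))
# ['apply_rotary_pos_emb', 'cross_entropy_loss', 'fused_add_rms_norm', 'skip_layer_norm']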

src/flag_gems/runtime/backend/_ascend/fused/cross_entropy_loss.py

Lines changed: 2 additions & 2 deletions
@@ -519,7 +519,7 @@ def sum_and_scale(
 class CrossEntropyLoss(torch.autograd.Function):
     @staticmethod
     def forward(ctx, inp, target, weight, reduction, ignore_index, label_smoothing):
-        logger.debug("GEMS CrossEntropyLoss")
+        logger.debug("GEMS_ASCEND CrossEntropyLoss")

         shape = list(inp.shape)
         dim = inp.ndim
@@ -607,7 +607,7 @@ def forward(ctx, inp, target, weight, reduction, ignore_index, label_smoothing):

     @staticmethod
     def backward(ctx, out_grad):
-        logger.debug("GEMS CrossEntropyLoss VJP")
+        logger.debug("GEMS_ASCEND CrossEntropyLoss VJP")

         inp, tgt, weight = ctx.saved_tensors
         N = ctx.N

src/flag_gems/runtime/backend/_ascend/fused/fused_add_rms_norm.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+import logging
+import math
+
+import triton
+import triton.language as tl
+
+from flag_gems.runtime import torch_device_fn
+from flag_gems.utils import libentry
+from flag_gems.utils import triton_lang_extension as tle
+
+logger = logging.getLogger(__name__)
+
+@libentry()
+@triton.jit(do_not_specialize=["eps"])
+def fused_add_rms_norm_kernel(
+    X,  # pointer to the input
+    R,  # pointer to the residual
+    W,  # pointer to the weight
+    x_stride_r,  # how much to increase the pointer when moving by 1 row
+    x_stride_c,  # how much to increase the pointer when moving by 1 col
+    r_stride_r,  # how much to increase the pointer when moving by 1 row
+    r_stride_c,  # how much to increase the pointer when moving by 1 col
+    N,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    X += pid * x_stride_r
+    R += pid * r_stride_r
+
+    _var_base = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(X + cols, mask, other=0.0).to(tl.float32)
+        r = tl.load(R + cols, mask, other=0.0).to(tl.float32)
+        x += r
+        _var_base += x * x / N
+    var = tl.sum(_var_base)
+
+    rrms = 1 / tl.sqrt(var + eps)
+
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        x = tl.load(X + cols, mask, other=0.0).to(tl.float32)
+        r = tl.load(R + cols, mask, other=0.0).to(tl.float32)
+        x += r
+        w = tl.load(W + cols, mask, other=0.0)
+        y = (x * rrms).to(X.dtype.element_ty) * w
+        # write back to residual and input
+        tl.store(R + cols * r_stride_c, x, mask=mask)
+        tl.store(X + cols * x_stride_c, y, mask=mask)
+
+
+def fused_add_rms_norm(x, residual, normalized_shape, weight, eps=1e-5):
+    """
+    This function performs fused residual addition and RMS normalization **in-place**.
+    Both `x` and `residual` tensors will be modified. Use with caution if these tensors
+    are reused elsewhere or require gradients.
+    """
+    logger.debug("GEMS_ASCEND FUSED_ADD_RMS_NORM FORWARD")
+    dim = x.ndim - len(normalized_shape)
+    M = min(math.prod(x.shape[:dim]), 65535)
+    N = math.prod(normalized_shape)
+
+    BLOCK_SIZE = min(triton.next_power_of_2(N), 8192)
+    x = x.contiguous()
+    residual = residual.contiguous()
+    weight = weight.contiguous()
+
+    with torch_device_fn.device(x.device):
+        fused_add_rms_norm_kernel[M,](
+            x, residual, weight, N, 1, N, 1, N, eps, BLOCK_SIZE
+        )
+    return x, residual
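
A minimal usage sketch for the new in-place op (not part of the commit; the shapes, dtype, eps value, and "npu" device string are illustrative assumptions, and the direct backend import path is just the module shown above — an installed build may expose the op through the public flag_gems fused namespace instead):

import torch

from flag_gems.runtime.backend._ascend.fused import fused_add_rms_norm

# Hypothetical sizes: batch 4, sequence length 128, hidden size 1024, on the
# "npu" device declared by this backend's VendorInfoBase.
hidden = 1024
x = torch.randn(4, 128, hidden, device="npu", dtype=torch.float16)
residual = torch.randn_like(x)
weight = torch.ones(hidden, device="npu", dtype=torch.float16)

# Both inputs are overwritten, as the docstring warns:
#   residual <- x + residual
#   x        <- rms_norm(x + residual) * weight
x, residual = fused_add_rms_norm(x, residual, [hidden], weight, eps=1e-6)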

src/flag_gems/runtime/backend/_ascend/fused/rotary_embedding.py

Lines changed: 278 additions & 0 deletions
@@ -0,0 +1,278 @@
+import logging
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+import flag_gems
+from flag_gems.runtime import torch_device_fn
+from flag_gems.utils import libentry
+from flag_gems.utils import triton_lang_extension as tle
+
+
+@triton.jit
+def rotary_embedding_rw_kernel(
+    state_out,
+    state,
+    cos,
+    sin,
+    stride_state_n,
+    stride_state_h,
+    stride_state_d,
+    stride_cos_n,
+    stride_cos_d,
+    num_tokens,
+    num_heads,
+    token_range,
+    head_range,
+    dim_range_x,
+    dim_range_y,
+    rotary_interleaved: tl.constexpr,
+):
+    state_x_offset = (
+        token_range[:, None, None] * stride_state_n
+        + head_range[None, :, None] * stride_state_h
+        + dim_range_x[None, None, :] * stride_state_d
+    )
+    state_y_offset = (
+        token_range[:, None, None] * stride_state_n
+        + head_range[None, :, None] * stride_state_h
+        + dim_range_y[None, None, :] * stride_state_d
+    )
+
+    cos_sim_offset = (
+        token_range[:, None, None] * stride_cos_n
+        + dim_range_x[None, None, :] * stride_cos_d
+    )
+    if rotary_interleaved:
+        sin_sim_offset = (
+            token_range[:, None, None] * stride_cos_n
+            + dim_range_y[None, None, :] * stride_cos_d
+        )
+    else:
+        sin_sim_offset = cos_sim_offset
+
+    state_x = tl.load(
+        state + state_x_offset,
+        mask=(token_range[:, None, None] < num_tokens)
+        & (head_range[None, :, None] < num_heads),
+        other=0.0,
+    )
+    state_y = tl.load(
+        state + state_y_offset,
+        mask=(token_range[:, None, None] < num_tokens)
+        & (head_range[None, :, None] < num_heads),
+        other=0.0,
+    )
+
+    cos_loaded = tl.load(
+        cos + cos_sim_offset,
+        mask=token_range[:, None, None] < num_tokens,
+        other=0.0,
+    ).to(tl.float32)
+    sin_loaded = tl.load(
+        sin + sin_sim_offset,
+        mask=token_range[:, None, None] < num_tokens,
+        other=0.0,
+    ).to(tl.float32)
+
+    out_x = state_x * cos_loaded - state_y * sin_loaded
+    out_y = state_x * sin_loaded + state_y * cos_loaded
+
+    tl.store(
+        state_out + state_x_offset,
+        out_x,
+        mask=(token_range[:, None, None] < num_tokens)
+        & (head_range[None, :, None] < num_heads),
+    )
+    tl.store(
+        state_out + state_y_offset,
+        out_y,
+        mask=(token_range[:, None, None] < num_tokens)
+        & (head_range[None, :, None] < num_heads),
+    )
+
+
+@libentry()
+@triton.jit
+def rotary_embedding_siso_kernel(
+    state_out,  # [num_tokens, head_num, head_dim]
+    state,  # [num_tokens, head_num, head_dim]
+    cos,  # [num_tokens, 1, head_dim // 2]
+    sin,  # [num_tokens, 1, head_dim // 2]
+    stride_state_n,
+    stride_state_h,
+    stride_state_d,
+    stride_cos_n,
+    stride_cos_d,
+    num_tokens,
+    num_heads,
+    BLOCK_N: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    rotary_interleaved: tl.constexpr,
+):
+    token_index = tl.program_id(0)
+    token_range = token_index * BLOCK_N + tl.arange(0, BLOCK_N)
+    head_index = tl.program_id(1)
+    head_range = head_index * BLOCK_H + tl.arange(0, BLOCK_H)
+
+    if rotary_interleaved:
+        for d in range(0, BLOCK_D // 2):
+            dim_range_x = d * 2
+            dim_range_y = d * 2 + 1
+
+            rotary_embedding_rw_kernel(
+                state_out,
+                state,
+                cos,
+                sin,
+                stride_state_n,
+                stride_state_h,
+                stride_state_d,
+                stride_cos_n,
+                stride_cos_d,
+                num_tokens,
+                num_heads,
+                token_range,
+                head_range,
+                dim_range_x,
+                dim_range_y,
+                rotary_interleaved,
+            )
+    else:
+        dim_range_x = tl.arange(0, BLOCK_D // 2)
+        dim_range_y = tl.arange(BLOCK_D // 2, BLOCK_D)
+        rotary_embedding_rw_kernel(
+            state_out,
+            state,
+            cos,
+            sin,
+            stride_state_n,
+            stride_state_h,
+            stride_state_d,
+            stride_cos_n,
+            stride_cos_d,
+            num_tokens,
+            num_heads,
+            token_range,
+            head_range,
+            dim_range_x,
+            dim_range_y,
+            rotary_interleaved,
+        )
+
+def apply_rotary_pos_emb(
+    q,
+    k,
+    cos,
+    sin,
+    position_ids: Optional[torch.IntTensor] = None,
+    rotary_interleaved: bool = False,
+):
+    """
+    Apply rotary position embedding to q and k
+
+    Args:
+        q: (*, q_heads, head_dim)
+        k: (*, k_heads, head_dim)
+        cos: (max_seq_len, head_dim // 2)
+        sin: (max_seq_len, head_dim // 2)
+        position_ids: (*, ), optional, position ids for each token
+        rotary_interleaved: whether the head_dim is rotated in an interleaved way
+
+    Returns:
+        q_embed: (*, q_heads, head_dim)
+        k_embed: (*, k_heads, head_dim)
+    """
+    logging.debug("GEMS_ASCEND ROTARY POS EMBEDDING")
+    assert (
+        k.shape[-1] == q.shape[-1]
+    ), f"q and k must have the same last dimension, got {q.shape} and {k.shape}"
+    assert (
+        cos.shape[-1] == sin.shape[-1]
+    ), f"cos and sin must have the same last dimension, got {cos.shape} and {sin.shape}"
+    assert (
+        cos.shape[-1] * 2 == q.shape[-1]
+    ), f"cos/sin dim must be half of q/k dim, got {cos.shape} and {q.shape}"
+    assert cos.stride(-1) == 1, "cos must be contiguous at the last dimension"
+    assert sin.stride(-1) == 1, "sin must be contiguous at the last dimension"
+
+    q_shape = q.shape
+    k_shape = k.shape
+
+    assert (
+        q.shape[:-2] == k.shape[:-2]
+    ), f"q and k must have the same length, got {q.shape[:-2]} and {k.shape[:-2]}"
+    if position_ids is None:
+        assert (
+            len(q.shape) == 4
+        ), f"q must have 4 dimensions if position_ids is not provided, got {q.shape}"
+        seq_len = q.shape[-3]
+    else:
+        assert (
+            position_ids.shape == q.shape[:-2]
+        ), f"position_ids must have the same length as q, got {position_ids.shape} and {q.shape[:-2]}"

+        position_ids = position_ids.view(-1)
+        seq_len = None
+
+    q = q.view(-1, q.shape[-2], q.shape[-1])
+    k = k.view(-1, k.shape[-2], k.shape[-1])
+
+    q_embed = torch.empty_like(q)
+    k_embed = torch.empty_like(k)
+
+    def torch_rotary_embedding(state_out, state, cos, sin):
+        num_tokens = state.shape[0]
+        num_heads = state.shape[1]
+        head_dim = state.shape[-1]
+
+        BLOCK_N = 8
+        BLOCK_H = 4
+        grid = (
+            triton.cdiv(num_tokens, BLOCK_N),
+            triton.cdiv(num_heads, BLOCK_H),
+        )
+        with torch_device_fn.device(state_out.device):
+            with flag_gems.use_gems():
+                if position_ids is None:
+                    cos = cos[: q_shape[-3], None, :]
+                    sin = sin[: q_shape[-3], None, :]
+                else:
+                    cos = cos[position_ids, None, :]
+                    sin = sin[position_ids, None, :]
+
+                if rotary_interleaved:
+                    cos = torch.repeat_interleave(cos, 2, dim=-1)
+                    sin = torch.repeat_interleave(sin, 2, dim=-1)
+                orig_cos = cos
+                orig_sin = sin
+                for _ in range(q_shape[0] - 1):
+                    cos = torch.cat((cos, orig_cos), dim=0)
+                    sin = torch.cat((sin, orig_sin), dim=0)
+                rotary_embedding_siso_kernel[grid](
+                    state_out,
+                    state,
+                    cos,
+                    sin,
+                    state.stride(0),
+                    state.stride(1),
+                    state.stride(2),
+                    cos.stride(0),
+                    cos.stride(2),
+                    num_tokens,
+                    num_heads,
+                    BLOCK_N=BLOCK_N,
+                    BLOCK_H=BLOCK_H,
+                    BLOCK_D=head_dim,
+                    rotary_interleaved=rotary_interleaved,
+                )
+
+    torch_rotary_embedding(q_embed, q, cos, sin)
+    torch_rotary_embedding(k_embed, k, cos, sin)
+
+    q_embed = q_embed.view(q_shape)
+    k_embed = k_embed.view(k_shape)
+    return q_embed, k_embed
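
And a matching sketch for apply_rotary_pos_emb (again not part of the commit; the sizes, device string, and the cos/sin table construction are illustrative assumptions chosen to satisfy the asserts in the function above):

import torch

from flag_gems.runtime.backend._ascend.fused import apply_rotary_pos_emb

# Hypothetical sizes: batch 2, seq_len 16, 8 query heads, 2 key heads, head_dim 64.
batch, seq_len, head_dim = 2, 16, 64
q = torch.randn(batch, seq_len, 8, head_dim, device="npu", dtype=torch.float16)
k = torch.randn(batch, seq_len, 2, head_dim, device="npu", dtype=torch.float16)

# The cos/sin tables cover half the head dimension, as the asserts require.
max_seq_len = 4096
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2, device="npu").float() / head_dim))
t = torch.arange(max_seq_len, device="npu").float()
freqs = torch.outer(t, inv_freq)  # (max_seq_len, head_dim // 2)
cos, sin = freqs.cos(), freqs.sin()

# Without position_ids, q and k must be 4-D; the first seq_len rows of cos/sin are used.
q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin, rotary_interleaved=False)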
