
[Feature, Hardware] Enable DeepseekV3 on AMD GPUs #2601

Merged: 27 commits merged on Jan 3, 2025
Changes from 21 commits

Commits (27):
45dfe9e  Add hip config (BruceXcluding, Dec 26, 2024)
d315402  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 27, 2024)
57a5006  Fix AMD moe_align and triton stage config (Dec 27, 2024)
3fa113b  fix fused_moe.py conflict (BruceXcluding, Dec 27, 2024)
f1c48e2  Merge branch 'main' into main (HaiShaw, Dec 27, 2024)
6fb6d7c  fix typo (BruceXcluding, Dec 27, 2024)
83a682a  Merge remote-tracking branch 'upstream/main' (BruceXcluding, Dec 27, 2024)
732c6b5  remove not_hip in fused_moe (BruceXcluding, Dec 27, 2024)
a645383  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 28, 2024)
8a62e6e  Add normalize_e4m3fnuz into block quant (Dec 28, 2024)
2b4afba  merged upstream and add amd block_shape moe config (Dec 28, 2024)
f0122b7  Fix shmem/LDS size constraint on AMD MI3xx (HaiShaw, Dec 29, 2024)
4379b5c  Lint (HaiShaw, Dec 29, 2024)
fe54618  Merge branch 'main' into main (HaiShaw, Dec 29, 2024)
0a3b5c1  fix MOE_PADDING=1 mismatch (Dec 29, 2024)
ba1597c  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 29, 2024)
1c48b3d  fix e4m3fnuz scaling max (Dec 30, 2024)
a021825  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 30, 2024)
7aad77e  refactor setup.py with rocm (Dec 30, 2024)
3dddac3  merge haishaw FP8 Numerical fix (Dec 30, 2024)
ca11e11  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 30, 2024)
abc497d  sperate sgl-kernel with amd backend (BruceXcluding, Dec 31, 2024)
4bb3332  Merge 'main' into 'main' (Jan 2, 2025)
b10c089  Clang format (BruceXcluding, Jan 2, 2025)
bf2ad5d  Merge branch 'main' into main (zhyncs, Jan 2, 2025)
3b63a5f  Merge branch 'main' into main (zhyncs, Jan 2, 2025)
7b8d375  Merge branch 'main' into main (HaiShaw, Jan 2, 2025)
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -27,7 +27,7 @@ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cu

# HIP (Heterogeneous-computing Interface for Portability) for AMD
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post2.dev1+g1ef171e0.rocm624"]
Member: What issues could occur if the image isn't updated? Minimize updating the base image whenever possible.

Collaborator: @zhyncs we (AMD) will have to decide on this, so ignore it for now.

# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
srt_xpu = ["sglang[runtime_common]"]
@@ -406,6 +406,10 @@ def _decode_grouped_att_m_fwd(
Lk = k_buffer.shape[-1]
Lv = v_buffer.shape[-1]

# [TODO] work around shmem limit on MI3xx
if is_hip_ and Lk >= 576:
BLOCK = 16

if Lk == 576:
BLOCK_DMODEL = 512
BLOCK_DPE = 64
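For context on this workaround: the grouped decode kernel stages a K tile of roughly BLOCK x (BLOCK_DMODEL + BLOCK_DPE) elements in shared memory, and with DeepseekV3's 576-dim MLA heads (512 + 64, as in the hunk above) the usual block size no longer fits an MI3xx workgroup's LDS. A back-of-the-envelope sketch in Python; the 64 KB LDS budget, the fp16 element size, and the default BLOCK of 64 are assumptions, not figures taken from this diff:

# Rough LDS estimate for one program of the grouped decode kernel.
# Assumed model: one fp16 K tile of shape (BLOCK, BLOCK_DMODEL + BLOCK_DPE)
# must fit in a 64 KB per-workgroup LDS budget on MI3xx.
def k_tile_bytes(block: int, d_model: int = 512, d_pe: int = 64, elem_bytes: int = 2) -> int:
    return block * (d_model + d_pe) * elem_bytes

print(k_tile_bytes(64))  # 73728 bytes: over a 64 KB (65536-byte) budget
print(k_tile_bytes(16))  # 18432 bytes: fits comfortably

This is why the hunk caps BLOCK at 16 only when is_hip_ is set and Lk >= 576.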
21 changes: 9 additions & 12 deletions python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -11,18 +11,14 @@
import torch
import triton
import triton.language as tl
from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
from vllm import _custom_ops as ops

from sglang.srt.layers.moe.topk import select_experts
from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip

not_hip = False
if not is_hip():
from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size

not_hip = True

is_hip_ = is_hip()
logger = logging.getLogger(__name__)
padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0

@@ -272,7 +268,7 @@ def moe_align_block_size(
(max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
)
num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
if not_hip and num_experts >= 224:
if num_experts >= 224:
token_cnts_buffer = torch.empty(
(num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
)
@@ -326,11 +322,12 @@ def invoke_fused_moe_kernel(

padded_size = 0
if use_fp8_w8a8:
padded_size = padding_size
assert B_scale is not None
if block_shape is None:
padded_size = padding_size
A, A_scale = ops.scaled_fp8_quant(A, A_scale)
else:
padding_size = 0
assert len(block_shape) == 2
block_n, block_k = block_shape[0], block_shape[1]
A, A_scale = per_token_group_quant_fp8(A, block_k)
@@ -463,7 +460,7 @@ def get_default_config(
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4,
"num_stages": 2 if is_hip_ else 4,
}
if M <= E:
config = {
@@ -472,7 +469,7 @@
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4,
"num_stages": 2 if is_hip_ else 4,
}
else:
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
@@ -482,7 +479,7 @@
"BLOCK_SIZE_K": block_shape[1],
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3,
"num_stages": 2 if is_hip_ else 3,
}
else:
config = {
@@ -727,7 +724,7 @@ def fused_experts_impl(
block_shape: Optional[List[int]] = None,
):
padded_size = padding_size
if not use_fp8_w8a8:
if not use_fp8_w8a8 or block_shape is not None:
padded_size = 0

# Check constraints.
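Taken together, these hunks make the Triton fused-MoE path HIP-aware in three places: moe_align_block_size from sgl_kernel is imported unconditionally, the default kernel configs drop to two pipeline stages on ROCm, and the MOE_PADDING trick is skipped whenever block-wise FP8 quantization is active. A condensed sketch that paraphrases the diff (surrounding code elided; not a drop-in replacement for the file):

import os

import torch

# Mirrors the module-level flags in the diff; sglang.srt.utils.is_hip()
# boils down to a ROCm-build check like this one.
is_hip_ = torch.version.hip is not None
padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0


def default_num_stages(block_quant: bool) -> int:
    # ROCm defaults to 2 pipeline stages; CUDA keeps 4 (or 3 for block quant).
    if is_hip_:
        return 2
    return 3 if block_quant else 4


def effective_padding(use_fp8_w8a8: bool, block_shape) -> int:
    # Weight padding (MOE_PADDING) only applies on the per-tensor FP8 path;
    # block-quantized weights (block_shape is not None) skip it.
    if not use_fp8_w8a8 or block_shape is not None:
        return 0
    return padding_size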
98 changes: 66 additions & 32 deletions sgl-kernel/setup.py
@@ -3,6 +3,14 @@
import zipfile
from pathlib import Path

import torch


def is_hip() -> bool:
"""Return whether it is HIP on the AMD ROCm platform."""
return torch.version.hip is not None
Member (suggested change):
-    return torch.version.hip is not None
+    return torch.cuda.is_available() and torch.version.hip
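For reference, the two checks answer slightly different questions: torch.version.hip is not None only says the installed PyTorch wheel was built against ROCm, while the suggested form additionally requires a visible device at runtime. A minimal illustration; this describes standard PyTorch behavior, not anything asserted by the PR itself:

import torch

# True on any ROCm build of PyTorch, even on a machine with no GPU attached.
rocm_build = torch.version.hip is not None

# Also requires that a device is actually visible at runtime
# (on ROCm, torch.cuda.is_available() reports HIP devices).
rocm_usable = torch.cuda.is_available() and rocm_build

print(f"built for ROCm: {rocm_build}, ROCm device usable: {rocm_usable}")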


from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

@@ -58,38 +66,64 @@ def update_wheel_platform_tag():
old_wheel.rename(new_wheel)


nvcc_flags = [
"-O3",
"-Xcompiler",
"-fPIC",
"-gencode=arch=compute_75,code=sm_75",
"-gencode=arch=compute_80,code=sm_80",
"-gencode=arch=compute_89,code=sm_89",
"-gencode=arch=compute_90,code=sm_90",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF2_OPERATORS__",
]
cxx_flags = ["-O3"]
libraries = ["c10", "torch", "torch_python"]
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib"]
ext_modules = [
CUDAExtension(
name="sgl_kernel.ops._kernels",
sources=[
"src/sgl-kernel/csrc/warp_reduce_kernel.cu",
"src/sgl-kernel/csrc/trt_reduce_internal.cu",
"src/sgl-kernel/csrc/trt_reduce_kernel.cu",
"src/sgl-kernel/csrc/moe_align_kernel.cu",
"src/sgl-kernel/csrc/sgl_kernel_ops.cu",
],
extra_compile_args={
"nvcc": nvcc_flags,
"cxx": cxx_flags,
},
libraries=libraries,
extra_link_args=extra_link_args,
),
]
if not is_hip():
nvcc_flags = [
"-O3",
"-Xcompiler",
"-fPIC",
"-gencode=arch=compute_75,code=sm_75",
"-gencode=arch=compute_80,code=sm_80",
"-gencode=arch=compute_89,code=sm_89",
"-gencode=arch=compute_90,code=sm_90",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF2_OPERATORS__",
]
cxx_flags = ["-O3"]
libraries = ["c10", "torch", "torch_python"]
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib"]
ext_modules = [
CUDAExtension(
name="sgl_kernel.ops._kernels",
sources=[
"src/sgl-kernel/csrc/warp_reduce_kernel.cu",
"src/sgl-kernel/csrc/trt_reduce_internal.cu",
"src/sgl-kernel/csrc/trt_reduce_kernel.cu",
"src/sgl-kernel/csrc/moe_align_kernel.cu",
"src/sgl-kernel/csrc/sgl_kernel_ops.cu",
],
extra_compile_args={
"nvcc": nvcc_flags,
"cxx": cxx_flags,
},
libraries=libraries,
extra_link_args=extra_link_args,
),
]
else:
hipcc_flags = [
"-D__HIP_PLATFORM_AMD__=1",
"--amdgpu-target=gfx90a,gfx940,gfx941,gfx942",
]
ext_modules=[
CUDAExtension(
"sgl_kernel.ops.moe_align_block_size",
[
"src/sgl-kernel/csrc/moe_align_kernel.cu",
"src/sgl-kernel/csrc/sgl_kernel_ops.cu",
Member: If you need to compile for AMD, I recommend not compiling sgl_kernel_ops.cu directly. Use a separate file instead, to avoid mixing NVIDIA and AMD .cu files; it's better to keep them separate. cc @HaiShaw @ispobock @merrymercy


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have any suggestions? @yzh119

It seems the reduce kernel also needs to be compiled here; otherwise some archs fail to import with "No module named 'sgl_kernel.ops._kernels'".

Member: May we use is_hip there?

Collaborator: @zhyncs For CUDA/HIP-compatible kernel files we don't use separate files (that is the point of HIP), and I believe this is one of those cases. We do, for sure, keep separate files for AMD-specific kernels or kernel implementations.

Collaborator: @zyeric the else: case seemingly has no impact on the NV side; can you be more specific?

Maybe it's better to separate the AMD/NV kernels into two different backends? At the moment moe_align_kernel is the only kernel the AMD backend requires, while in the near future CK kernels will be added to it.

@HaiShaw I think the root cause is that the import path is still sgl_kernel.ops._kernels at https://github.com/BruceXcluding/sglang/blob/main/sgl-kernel/src/sgl-kernel/ops/__init__.py#L1
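To illustrate the point under discussion, one way to reconcile the two extension names is to choose the import at package-import time. This is only a sketch of that idea, not the file in the PR; the extension module names come from this setup.py, and the assumption that each extension exposes a moe_align_block_size symbol is mine:

# Hypothetical dispatch for sgl_kernel/ops/__init__.py (illustrative only;
# the exported symbol names are assumed, not confirmed by this diff).
import torch

if torch.version.hip is not None:
    # The HIP build in this PR only ships the moe_align extension.
    from sgl_kernel.ops.moe_align_block_size import moe_align_block_size
else:
    # The CUDA build keeps everything in the _kernels extension.
    from sgl_kernel.ops._kernels import moe_align_block_size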

zyeric (Dec 31, 2024): Current version works for me, many thanks :D

Accuracy: 0.951
Invalid: 0.000
Latency: 160.916 s
Output throughput: 869.145 token/s

],
extra_compile_args={
"nvcc": hipcc_flags
+ [
"-O3",
"-Xcompiler",
"-fPIC",
],
"cxx": ["-O3"],
},
libraries=["hiprtc", "amdhip64", "c10", "torch", "torch_python"],
extra_link_args=["-Wl,-rpath,$ORIGIN/../../torch/lib"],
),
]

setup(
name="sgl-kernel",