[Feature, Hardware] Enable DeepseekV3 on AMD GPUs #2601
We should not cut this in half for HIP globally here.
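
(For illustration, a minimal sketch of scoping such a change to ROCm instead of applying it globally; it assumes sglang's is_hip() helper, and the tuning constant here is hypothetical.)

```python
from sglang.srt.utils import is_hip  # True on ROCm/HIP builds

DEFAULT_BLOCK = 32  # hypothetical tuning constant

# Halve only on HIP, leaving CUDA behavior untouched.
block = DEFAULT_BLOCK // 2 if is_hip() else DEFAULT_BLOCK
```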
It doesn't work well in the latest vLLM with BLOCK 32.
We cannot take this part as is - it would cost all other models a large margin of performance.
We should not have this; the serialized weight is always OCP (torch.float8_e4m3fn).
It would encounter this error with torch.float8_e4m3fn at w8a8_block_fp8_matmul:

python/sglang/srt/layers/quantization/fp8_kernel.py:176:33: error: Unsupported conversion from 'f8E4M3FN' to 'f16'
    accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
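
(The underlying issue: ROCm's MI300-class hardware implements the non-OCP fnuz variant of fp8, so the Triton backend cannot lower OCP f8E4M3FN there. A hedged sketch of the usual platform-dependent dtype pick:)

```python
import torch

# ROCm uses the non-OCP e4m3fnuz format; CUDA uses OCP e4m3fn.
# Note the differing dynamic ranges.
fp8_dtype = (
    torch.float8_e4m3fnuz if torch.version.hip is not None
    else torch.float8_e4m3fn
)
fp8_max = torch.finfo(fp8_dtype).max  # 240.0 for fnuz, 448.0 for fn
```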
Please check how normalize_e4m3fn_to_e4m3fnuz is used. Basically, we do not expect a non-OCP/e4m3fn dtype in the quantized model.
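
(For context, a sketch of what that helper does, modeled on the vLLM utility of the same name - treat the exact signature as an assumption. Checkpoints stay OCP e4m3fn on disk, and the weights are rewritten to e4m3fnuz after loading.)

```python
from typing import Optional, Tuple

import torch


def normalize_e4m3fn_to_e4m3fnuz(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    assert weight.dtype == torch.float8_e4m3fn
    # The bit pattern 0b10000000 is -0.0 in e4m3fn but NaN in
    # e4m3fnuz, so zero it out before reinterpreting the bits.
    weight_as_int8 = weight.view(torch.int8)
    weight_as_int8[weight_as_int8 == -128] = 0
    weight = weight_as_int8.view(torch.float8_e4m3fnuz)
    # For the same bits, an e4m3fnuz value is half the e4m3fn value
    # (the exponent bias differs by one), so double the scales to
    # keep the dequantized values unchanged.
    weight_scale = weight_scale * 2.0
    if input_scale is not None:
        input_scale = input_scale * 2.0
    return weight, weight_scale, input_scale
```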
Same problem here - check out the previous usage of normalize_e4m3fn_to_e4m3fnuz.
Try not to use dynamic binding to assign the default - we can go without a default for this param.
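
(A small illustration of the anti-pattern, with hypothetical names: a default expression is evaluated once at import time, so binding it dynamically silently freezes the value; dropping the default makes the caller pass it explicitly.)

```python
import torch

# Fragile: the default binds whatever dtype was set at import time
# and ignores any later change (hypothetical example).
def scaled_cast_bad(x, out_dtype=torch.get_default_dtype()):
    return x.to(out_dtype)

# Preferred: no default - the caller states the dtype explicitly.
def scaled_cast(x, out_dtype):
    return x.to(out_dtype)
```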
Simplify this as in the original.
Use elif - the original would conflict with the v3 hf_config.architectures.
Change the else below to elif self.w_kc.dtype == torch.float8_e4m3fnuz.
BTW, please add a TODO here (there is no bmm for torch.float8_e4m3fnuz).
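
(A hedged sketch of the requested dispatch; the function name and the upcast fallback are assumptions, since torch.bmm currently has no torch.float8_e4m3fnuz kernel.)

```python
import torch

def bmm_wkc(q_nope: torch.Tensor, w_kc: torch.Tensor) -> torch.Tensor:
    if w_kc.dtype == torch.bfloat16:
        return torch.bmm(q_nope, w_kc)
    elif w_kc.dtype == torch.float8_e4m3fnuz:
        # TODO: torch.bmm lacks a torch.float8_e4m3fnuz kernel;
        # upcast both operands as a stopgap.
        return torch.bmm(q_nope.to(torch.bfloat16),
                         w_kc.to(torch.bfloat16))
    raise NotImplementedError(f"unsupported w_kc dtype: {w_kc.dtype}")
```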
Same comment as the previous one.
Use or instead of if ... else ...
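
(For illustration, with hypothetical names - or returns its first truthy operand, which collapses the conditional expression:)

```python
default_scale = 1.0
provided_scale = None  # e.g. not set by the caller

# Conditional-expression form:
scale = provided_scale if provided_scale else default_scale
# Equivalent `or` form:
scale = provided_scale or default_scale
assert scale == 1.0
```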
https://pytorch.org/docs/stable/notes/hip.html
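
(Per those HIP notes, ROCm builds reuse the torch.cuda namespace, and torch.version.hip tells the backends apart; a quick check:)

```python
import torch

# HIP masquerades as CUDA: torch.cuda.* works unchanged on ROCm.
# torch.version.hip is set only on ROCm builds, torch.version.cuda
# only on CUDA builds.
if torch.cuda.is_available():
    backend = "ROCm/HIP" if torch.version.hip is not None else "CUDA"
    print(f"running on {backend}: {torch.cuda.get_device_name()}")
```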
Broaden this target list.
The two comment lines above make no sense.