[Feature, Hardware] Enable DeepseekV3 on AMD GPUs #2601

Merged: 27 commits, Jan 3, 2025
Changes shown below are from 1 commit.

Commits
45dfe9e  Add hip config (BruceXcluding, Dec 26, 2024)
d315402  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 27, 2024)
57a5006  Fix AMD moe_align and triton stage config (Dec 27, 2024)
3fa113b  fix fused_moe.py conflict (BruceXcluding, Dec 27, 2024)
f1c48e2  Merge branch 'main' into main (HaiShaw, Dec 27, 2024)
6fb6d7c  fix typo (BruceXcluding, Dec 27, 2024)
83a682a  Merge remote-tracking branch 'upstream/main' (BruceXcluding, Dec 27, 2024)
732c6b5  remove not_hip in fused_moe (BruceXcluding, Dec 27, 2024)
a645383  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 28, 2024)
8a62e6e  Add normalize_e4m3fnuz into block quant (Dec 28, 2024)
2b4afba  merged upstream and add amd block_shape moe config (Dec 28, 2024)
f0122b7  Fix shmem/LDS size constraint on AMD MI3xx (HaiShaw, Dec 29, 2024)
4379b5c  Lint (HaiShaw, Dec 29, 2024)
fe54618  Merge branch 'main' into main (HaiShaw, Dec 29, 2024)
0a3b5c1  fix MOE_PADDING=1 mismatch (Dec 29, 2024)
ba1597c  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 29, 2024)
1c48b3d  fix e4m3fnuz scaling max (Dec 30, 2024)
a021825  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 30, 2024)
7aad77e  refactor setup.py with rocm (Dec 30, 2024)
3dddac3  merge haishaw FP8 Numerical fix (Dec 30, 2024)
ca11e11  Merge branch 'sgl-project:main' into main (BruceXcluding, Dec 30, 2024)
abc497d  sperate sgl-kernel with amd backend (BruceXcluding, Dec 31, 2024)
4bb3332  Merge 'main' into 'main' (Jan 2, 2025)
b10c089  Clang format (BruceXcluding, Jan 2, 2025)
bf2ad5d  Merge branch 'main' into main (zhyncs, Jan 2, 2025)
3b63a5f  Merge branch 'main' into main (zhyncs, Jan 2, 2025)
7b8d375  Merge branch 'main' into main (HaiShaw, Jan 2, 2025)
4 changes: 2 additions & 2 deletions docker/Dockerfile.rocm
@@ -2,7 +2,7 @@
# docker build --build-arg SGL_BRANCH=v0.4.1 -t v0.4.1-rocm620 -f Dockerfile.rocm .

# default base image
ARG BASE_IMAGE="rocm/vllm-dev:20241022"
ARG BASE_IMAGE="rocm/vllm-dev:20241226"

FROM $BASE_IMAGE AS base
USER root
@@ -37,7 +37,7 @@ ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112

ENV MOE_PADDING=1
ENV MOE_PADDING=0
Collaborator: We need to keep MOE_PADDING on for performance; the error it incurs is something we need to fix.

ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
4 changes: 2 additions & 2 deletions python/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
[project.optional-dependencies]
runtime_common = ["aiohttp", "decord", "fastapi",
"hf_transfer", "huggingface_hub", "interegular", "modelscope",
"orjson", "outlines>=0.0.44,<0.1.0",
"orjson", "outlines>=0.1.7", "outlines-core>=0.1.17",
"packaging", "pillow", "prometheus-client>=0.20.0",
"psutil", "pydantic", "python-multipart",
"pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
@@ -27,7 +27,7 @@ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cu

# HIP (Heterogeneous-computing Interface for Portability) for AMD
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.5.dev414+ga2646938.rocm634"]
# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
srt_xpu = ["sglang[runtime_common]"]
2 changes: 1 addition & 1 deletion python/sglang/srt/constrained/outlines_backend.py
@@ -20,7 +20,7 @@
import interegular
import torch
from outlines.fsm.guide import RegexGuide
from outlines.fsm.json_schema import build_regex_from_schema
from outlines_core.fsm.json_schema import build_regex_from_schema
from outlines.models.transformers import TransformerTokenizer
from pydantic import BaseModel

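Not part of the PR: since the optional dependency moves from outlines<0.1.0 to outlines>=0.1.7 plus outlines-core, a compatibility import is one way to keep older environments working. A minimal sketch, assuming both packages expose build_regex_from_schema at the paths shown in the diff above:

```python
# Sketch only: prefer the new outlines-core location, fall back to legacy outlines.
try:
    from outlines_core.fsm.json_schema import build_regex_from_schema  # outlines >= 0.1.x
except ImportError:
    from outlines.fsm.json_schema import build_regex_from_schema  # outlines < 0.1.0
```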
@@ -402,7 +402,7 @@ def _decode_grouped_att_m_fwd(
sm_scale,
logit_cap,
):
BLOCK = 32
BLOCK = 16 if is_hip() else 32
Collaborator: We should not cut BLOCK in half for HIP globally here.

Contributor Author: It doesn't work well in the latest vLLM with BLOCK = 32.

Collaborator: We cannot take this part as it is; it would cost every other model a large margin of performance.

Lk = k_buffer.shape[-1]
Lv = v_buffer.shape[-1]

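Not part of the PR: following the review thread above, one way to avoid halving BLOCK for every model on HIP is to key the reduction on the head dimension being processed. A sketch under assumptions: _choose_decode_block is a hypothetical helper, and the 576 cutoff is a placeholder standing in for whatever head size actually exceeds the MI3xx LDS budget, not a measured threshold.

```python
from sglang.srt.utils import is_hip


def _choose_decode_block(head_dim: int) -> int:
    # Keep the original BLOCK = 32 everywhere except the configurations that
    # would overflow shared memory (LDS) on AMD MI3xx GPUs.
    if is_hip() and head_dim >= 576:  # assumed cutoff, not a measured threshold
        return 16
    return 32
```

Call sites would then use BLOCK = _choose_decode_block(Lk) instead of a blanket is_hip() switch.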
8 changes: 5 additions & 3 deletions python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -11,7 +11,9 @@
import torch
import triton
import triton.language as tl
from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
from sglang.srt.utils import is_hip
if not is_hip():
from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
from vllm import _custom_ops as ops

from sglang.srt.layers.moe.topk import select_experts
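Not part of the PR: a sketch of the same guarded import with an explicit sentinel, so call sites can test availability directly instead of repeating the not is_hip() check; the None fallback is an assumption, not what the PR commits.

```python
from sglang.srt.utils import is_hip

if not is_hip():
    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
else:
    # sgl_kernel is not built for the AMD backend in this PR; the Triton/vLLM
    # path is used on ROCm instead.
    sgl_moe_align_block_size = None
```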
@@ -268,7 +270,7 @@ def moe_align_block_size(
)
num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
# FIXME(zhyncs)
if num_experts >= 256:
if num_experts >= 256 and not is_hip():
sgl_moe_align_block_size(
topk_ids,
num_experts,
@@ -430,7 +432,7 @@ def get_default_config(
dtype: Optional[str],
is_marlin: bool,
) -> Dict[str, int]:
if dtype == "fp8_w8a8":
if dtype == "fp8_w8a8" and not is_hip():
Collaborator: The following block isn't a blocker for HIP.

config = {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
4 changes: 2 additions & 2 deletions python/sglang/srt/layers/quantization/fp8.py
@@ -217,7 +217,7 @@ def create_weights(

# WEIGHT
weight_dtype = (
torch.float8_e4m3fn
torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
Collaborator: We should not have this; the serialized weight is always OCP (torch.float8_e4m3fn).

Contributor Author: With torch.float8_e4m3fn, w8a8_block_fp8_matmul fails with: python/sglang/srt/layers/quantization/fp8_kernel.py:176:33: error: Unsupported conversion from 'f8E4M3FN' to 'f16' at accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :].

Collaborator: Please check how normalize_e4m3fn_to_e4m3fnuz is used. Basically, we do not expect a non-OCP/e4m3fn dtype in the quantized model.

if self.quant_config.is_checkpoint_fp8_serialized
else params_dtype
)
@@ -432,7 +432,7 @@ def create_weights(
from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported

if self.quant_config.is_checkpoint_fp8_serialized:
params_dtype = torch.float8_e4m3fn
params_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
Collaborator: Same problem here; check the previous usage of normalize_e4m3fn_to_e4m3fnuz.

tp_size = get_tensor_model_parallel_world_size()
if self.block_quant:
block_n, block_k = (
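Not part of the PR: a sketch of the pattern the reviewer points to, keeping checkpoints serialized as OCP torch.float8_e4m3fn and converting to e4m3fnuz only after loading on ROCm. The import path and the (weight, weight_scale, input_scale) signature of normalize_e4m3fn_to_e4m3fnuz are assumptions based on the reviewer's description, not verified against this branch.

```python
import torch

from sglang.srt.utils import is_hip
# Assumed location of the helper; adjust to wherever it lives on this branch.
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    normalize_e4m3fn_to_e4m3fnuz,
)


def maybe_normalize_for_hip(weight, weight_scale, input_scale=None):
    # Weights are created and loaded as OCP float8_e4m3fn; only the in-memory
    # copy is rewritten to float8_e4m3fnuz (with rescaled scales) on ROCm.
    if is_hip() and weight.dtype == torch.float8_e4m3fn:
        weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
            weight, weight_scale, input_scale
        )
    return weight, weight_scale, input_scale
```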
4 changes: 2 additions & 2 deletions python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -3,7 +3,7 @@
import torch
import triton
import triton.language as tl

from sglang.srt.utils import is_hip

@triton.jit
def _per_token_group_quant_fp8(
@@ -51,7 +51,7 @@ def per_token_group_quant_fp8(
x: torch.Tensor,
group_size: int,
eps: float = 1e-10,
dtype: torch.dtype = torch.float8_e4m3fn,
dtype: torch.dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
Collaborator: Try not to use a dynamic expression to assign the default; this parameter can go without a default.

) -> Tuple[torch.Tensor, torch.Tensor]:
"""Function to perform per-token-group quantization on an input tensor `x`.

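Not part of the PR: the reviewer asks not to evaluate is_hip() inside the default argument. A sketch of one way to follow that, resolving the dtype inside the function with an optional None default (a slight variant of dropping the default entirely); the function body is elided.

```python
from typing import Optional, Tuple

import torch

from sglang.srt.utils import is_hip


def per_token_group_quant_fp8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: Optional[torch.dtype] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Resolve the platform-dependent dtype at call time, not at import time.
    if dtype is None:
        dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
    ...  # existing quantization body unchanged
```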
46 changes: 26 additions & 20 deletions python/sglang/srt/models/deepseek_v2.py
@@ -55,7 +55,7 @@
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import is_flashinfer_available
from sglang.srt.utils import is_flashinfer_available, is_hip

if is_flashinfer_available():
from flashinfer import bmm_fp8
@@ -573,13 +573,16 @@ def forward_absorb(
)
q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)

if self.w_kc.dtype == torch.float8_e4m3fn:
q_nope_val, q_nope_scale = input_to_float8(
q_nope.transpose(0, 1), torch.float8_e4m3fn
)
q_nope_out = bmm_fp8(
q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
)
if self.w_kc.dtype == torch.float8_e4m3fn or torch.float8_e4m3fnuz:
Collaborator: Simplify this as in the original.

Contributor Author: An elif with the original check would conflict with the V3 hf_config.architect.

if is_hip():
q_nope_out = torch.bmm(q_nope.to(torch.bfloat16).transpose(0, 1), self.w_kc.to(torch.bfloat16))
else:
q_nope_val, q_nope_scale = input_to_float8(
q_nope.transpose(0, 1), torch.float8_e4m3fn
)
q_nope_out = bmm_fp8(
q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
)
Collaborator: Change the else below to elif self.w_kc.dtype == torch.float8_e4m3fnuz.

Collaborator: By the way, please add a TODO here about the lack of bmm for torch.float8_e4m3fnuz.

else:
q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
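Not part of the PR: the review suggestions above put together as a sketch, using the same names as this hunk. Note that self.w_kc.dtype == torch.float8_e4m3fn or torch.float8_e4m3fnuz is always truthy because the second operand is a bare dtype, which is why the reviewer asks for an explicit elif.

```python
if self.w_kc.dtype == torch.float8_e4m3fn:
    q_nope_val, q_nope_scale = input_to_float8(
        q_nope.transpose(0, 1), torch.float8_e4m3fn
    )
    q_nope_out = bmm_fp8(
        q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
    )
elif self.w_kc.dtype == torch.float8_e4m3fnuz:
    # TODO: no fp8 bmm for torch.float8_e4m3fnuz yet; upcast to bf16 on ROCm.
    q_nope_out = torch.bmm(
        q_nope.to(torch.bfloat16).transpose(0, 1), self.w_kc.to(torch.bfloat16)
    )
else:
    q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
```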
@@ -598,17 +601,20 @@ def forward_absorb(
attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)

if self.w_vc.dtype == torch.float8_e4m3fn:
attn_output_val, attn_output_scale = input_to_float8(
attn_output.transpose(0, 1), torch.float8_e4m3fn
)
attn_bmm_output = bmm_fp8(
attn_output_val,
self.w_vc,
attn_output_scale,
self.w_scale,
torch.bfloat16,
)
if self.w_vc.dtype == torch.float8_e4m3fn or torch.float8_e4m3fnuz:
if is_hip():
attn_bmm_output = torch.bmm(attn_output.to(torch.bfloat16).transpose(0, 1), self.w_vc.to(torch.bfloat16))
else:
attn_output_val, attn_output_scale = input_to_float8(
attn_output.transpose(0, 1), torch.float8_e4m3fn
)
attn_bmm_output = bmm_fp8(
attn_output_val,
self.w_vc,
attn_output_scale,
self.w_scale,
torch.bfloat16,
)
Collaborator: Same comment as for the previous block.

else:
attn_bmm_output = torch.bmm(attn_output.transpose(0, 1), self.w_vc)
attn_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
@@ -942,7 +948,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
# This may affect the accuracy of fp8 model.
if (
hasattr(self.quant_config, "weight_block_size")
and w.dtype == torch.float8_e4m3fn
and w.dtype == torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
Collaborator: Use or instead of if ... else ... here.

):
weight_block_size = self.quant_config.weight_block_size
if weight_block_size is not None:
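Not part of the PR: the reviewer's "use or" suggestion written out as a sketch. As committed, the trailing if is_hip() else torch.float8_e4m3fn is a conditional expression that wraps the entire and clause, so on non-HIP the condition collapses to a truthy dtype object and the hasattr check is bypassed.

```python
if hasattr(self.quant_config, "weight_block_size") and (
    w.dtype == torch.float8_e4m3fn or w.dtype == torch.float8_e4m3fnuz
):
    weight_block_size = self.quant_config.weight_block_size
```

An equivalent spelling is w.dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz).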