ROCm
diff --git a/‎.buildkite/scripts/hardware_ci/run-cpu-test.sh‎
Lines changed: 4 additions & 1 deletion b/‎.buildkite/scripts/hardware_ci/run-cpu-test.sh‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎.github/ISSUE_TEMPLATE/400-bug-report.yml‎
Lines changed: 10 additions & 0 deletions b/‎.github/ISSUE_TEMPLATE/400-bug-report.yml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎.github/mergify.yml‎
Lines changed: 20 additions & 0 deletions b/‎.github/mergify.yml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 10 additions & 5 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎benchmarks/P3L_mling.py‎
Lines changed: 4 additions & 4 deletions b/‎benchmarks/P3L_mling.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎benchmarks/kernels/bench_int8_gemm.py‎
Lines changed: 200 additions & 0 deletions b/‎benchmarks/kernels/bench_int8_gemm.py‎
Lines changed: 200 additions & 0 deletions
diff --git a/‎benchmarks/kernels/benchmark_moe.py‎
Lines changed: 4 additions & 10 deletions b/‎benchmarks/kernels/benchmark_moe.py‎
Lines changed: 4 additions & 10 deletions
diff --git a/‎csrc/cpu/attention.cpp‎
Lines changed: 3 additions & 3 deletions b/‎csrc/cpu/attention.cpp‎
Lines changed: 3 additions & 3 deletions
@@ -43,7 +43,10 @@ function cpu_tests() {
     pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/language/generation -m cpu_model
     pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
+    pytest -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_mllama.py \
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"
 
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
 
@@ -8,6 +8,16 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: markdown
+  attributes:
+    value: |
+      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
+      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
+      - Passwords or authentication credentials
+      - Private URLs or endpoints
+      - Personal or confidential data
+      
+      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment
 
@@ -65,6 +65,26 @@ pull_request_rules:
       add:
         - multi-modality
 
+- name: label-rocm
+  description: Automatically apply rocm label
+  conditions:
+    - or:
+      - files~=^csrc/rocm/
+      - files~=^docker/Dockerfile.rocm
+      - files~=^requirements/rocm.*\.txt
+      - files~=^vllm/attention/backends/rocm.*\.py
+      - files~=^vllm/attention/ops/rocm.*\.py
+      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^tests/kernels/.*_rocm.*\.py
+      - files=vllm/platforms/rocm.py
+      - title~=(?i)AMD
+      - title~=(?i)ROCm
+  actions:
+    label:
+      add:
+        - rocm
+
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
 
@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
 
-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
@@ -20,12 +20,10 @@ repos:
     args: [--output-format, github, --fix]
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+  rev: v1.32.0
   hooks:
-  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+  - id: typos
 - repo: https://github.com/PyCQA/isort
   rev: 6.0.1
   hooks:
@@ -145,6 +143,13 @@ repos:
     types: [python]
     pass_filenames: false
     additional_dependencies: [regex]
+  - id: check-pickle-imports
+    name: Prevent new pickle/cloudpickle imports
+    entry: python tools/check_pickle_imports.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
 
@@ -91,19 +91,19 @@ def get_wikitext2_text(tokenizer):
     return test_enc, test_text
 
 
-def get_flores_plus_text(tokenizer, lng_scrpt):
+def get_flores_plus_text(tokenizer, lng_script):
+    """Fetch one FLORES+ parquet file and return it tokenized and as raw text.
+
+    Downloads ``<lng_script>.parquet`` from the
+    ``alexei-v-ivanov-amd/flores_plus`` dataset repo into the current
+    directory, joins the stripped entries of its ``text`` column with blank
+    lines, tokenizes the result and deletes the downloaded file again.
+
+    Args:
+        tokenizer: Callable applied to the joined text.
+        lng_script: File stem of the parquet file to download.
+
+    Returns:
+        A ``(test_enc, test_text)`` tuple: the tokenizer output and the
+        joined text it was computed from.
+    """
     hf_hub_download(
         repo_id="alexei-v-ivanov-amd/flores_plus",
         repo_type="dataset",
-        filename=lng_scrpt + ".parquet",
+        filename=lng_script + ".parquet",
         local_dir="./",
     )
 
-    df = pandas.read_parquet("./" + lng_scrpt + ".parquet")
+    df = pandas.read_parquet("./" + lng_script + ".parquet")
     test_text = "\n\n".join(line.strip() for line in df["text"])
     test_enc = tokenizer(test_text)
 
+    # The parquet file is only needed to build the text; remove the local copy.
-    os.remove("./" + lng_scrpt + ".parquet")
+    os.remove("./" + lng_script + ".parquet")
 
     return test_enc, test_text
 
 
@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+
+import torch
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
+from vllm.triton_utils import triton
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+        x_log=False,
+        line_arg="provider",
+        line_vals=[
+            "torch-bf16",
+            # "int8-tensor-w-token-a",
+            "int8-tensor-w-tensor-a",
+            "int8-channel-w-token-a",
+            # "int8-channel-w-tensor-a",
+            # "int8-tensor-w-token-a-noquant",
+            "int8-tensor-w-tensor-a-noquant",
+            "int8-channel-w-token-a-noquant",
+            # "int8-channel-w-tensor-a-noquant",
+        ],
+        line_names=[
+            "torch-bf16",
+            # "int8-tensor-w-token-a",
+            "int8-tensor-w-tensor-a",
+            "int8-channel-w-token-a",
+            # "int8-channel-w-tensor-a",
+            # "int8-tensor-w-token-a-noquant",
+            "int8-tensor-w-tensor-a-noquant",
+            "int8-channel-w-token-a-noquant",
+            # "int8-channel-w-tensor-a-noquant",
+        ],
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs INT8 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    """Time one (M, K) x (N, K) GEMM for ``provider`` and report throughput.
+
+    Random bf16 operands ``a`` (M x K) and ``b`` (N x K) are created on the
+    CUDA device, where ``M`` is ``batch_size``. ``provider`` selects the
+    kernel by substring: "torch-bf16" times ``torch.nn.functional.linear``;
+    any other name containing "int8" times ``vllm_scaled_mm`` on int8
+    operands. For int8 providers the weight ``b`` is always quantized before
+    timing; the activation ``a`` is quantized before timing when the name
+    contains "noquant" and inside the timed callable otherwise.
+
+    Returns:
+        A tuple of three TFLOP/s values computed from the 0.5, 0.8 and 0.2
+        quantile timings, in that order.
+
+    NOTE(review): a provider that matches none of the branches leaves
+    ``ms`` or ``run_quant`` unbound and fails with a NameError; presumably
+    only the names listed in ``line_vals`` are ever passed in.
+    """
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+
+    # Quantiles requested from the timer: median first, then the 0.2/0.8 spread.
+    quantiles = [0.5, 0.2, 0.8]
+
+    if "torch-bf16" in provider:
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+        )
+
+    elif "int8" in provider:
+        # Weights are always quantized ahead of time
+        if "noquant" in provider:
+            # For "noquant", activations are quantized here, outside the timed call
+            if "tensor-w-token-a" in provider:
+                # Dynamic per-token quant for A, static per-tensor quant for B
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
+                assert scale_b_int8.numel() == 1
+                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
+
+            elif "tensor-w-tensor-a" in provider:
+                # Static per-tensor quantization with fixed scales for both A and B
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
+                assert scale_b_int8.numel() == 1
+                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)
+
+            elif "channel-w-token-a" in provider:
+                # Dynamic per-channel quantization for weights, per-token quant for A
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
+                assert scale_b_int8.numel() == N
+                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
+
+            elif "channel-w-tensor-a" in provider:
+                # Dynamic per-channel quantization for weights, per-tensor quant for A
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
+                assert scale_b_int8.numel() == N
+                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)
+
+            # Timed callable: GEMM only, on the pre-quantized operands.
+            def run_quant():
+                return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
+
+        else:
+            # Quantize the activations during the GEMM call
+            if "tensor-w-token-a" in provider:
+                # Dynamic per-token quant for A, static per-tensor quant for B
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
+                assert scale_b_int8.numel() == 1
+
+                def run_quant():
+                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
+                    return vllm_scaled_mm(
+                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
+                    )
+
+            elif "tensor-w-tensor-a" in provider:
+                # Static per-tensor quantization with fixed scales for both A and B
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
+                assert scale_b_int8.numel() == 1
+
+                def run_quant():
+                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)
+                    return vllm_scaled_mm(
+                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
+                    )
+
+            elif "channel-w-token-a" in provider:
+                # Dynamic per-channel quant for weights, per-token quant for A
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
+                assert scale_b_int8.numel() == N
+
+                def run_quant():
+                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
+                    return vllm_scaled_mm(
+                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
+                    )
+
+            elif "channel-w-tensor-a" in provider:
+                # Dynamic per-channel quant for weights, static per-tensor quant for A
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
+                assert scale_b_int8.numel() == N
+
+                def run_quant():
+                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)
+                    return vllm_scaled_mm(
+                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
+                    )
+
+        # run_quant looks up b_int8 when it is called, so transposing here
+        # (after the closures are defined) still affects the timed GEMM.
+        b_int8 = b_int8.t()
+
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), quantiles=quantiles
+        )
+
+    # Calculate TFLOP/s, two flops per multiply-add
+    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
+    # A longer time means lower throughput, hence max_ms comes before min_ms.
+    return tflops(ms), tflops(max_ms), tflops(min_ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        choices=[*WEIGHT_SHAPES.keys()],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path=f"bench_int8_res_n{N}_k{K}",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
@@ -7,7 +7,6 @@
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict
 
 import ray
@@ -43,7 +42,7 @@ def benchmark_config(
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
     num_iters: int = 100,
-    block_quant_shape: List[int] = None,
+    block_quant_shape: list[int] = None,
     use_deep_gemm: bool = False,
 ) -> float:
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@@ -400,7 +399,7 @@ def benchmark(
         dtype: torch.dtype,
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
-        block_quant_shape: List[int] = None,
+        block_quant_shape: list[int] = None,
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
@@ -532,7 +531,7 @@ def save_configs(
     dtype: torch.dtype,
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
-    block_quant_shape: List[int],
+    block_quant_shape: list[int],
 ) -> None:
     dtype_str = get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -563,7 +562,6 @@ def main(args: argparse.Namespace):
     config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
     if args.model_prefix:
         config = getattr(config, args.model_prefix)
-    config = SimpleNamespace(**config)
 
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
@@ -595,11 +593,7 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = (
-        torch.float16
-        if current_platform.is_rocm()
-        else getattr(torch, config.torch_dtype)
-    )
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
 
@@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }
 
 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
-                                        const int size) {
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
+                                         const int size) {
   T max = max_data[0];
   for (int i = 1; i < size; ++i) {
     max = max >= max_data[i] ? max : max_data[i];
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
 
         if (partition_num == 1) continue;
 
-        reducePartitonSoftmax(
+        reducePartitionSoftmax(
             max_logits + seq_idx * num_heads * max_num_partitions +
                 head_idx * max_num_partitions,
             exp_sums + seq_idx * num_heads * max_num_partitions +