Commit 9b43d47

Merge pull request #577 from ROCm/upstream_merge_2025_06_12
Upstream merge 2025 06 12
2 parents 68af055 + 0eb854c commit 9b43d47

190 files changed: +3849 −1655 lines changed


.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 4 additions & 1 deletion
@@ -43,7 +43,10 @@ function cpu_tests() {
     pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/language/generation -m cpu_model
     pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
+    pytest -v -s tests/models/multimodal/generation \
+      --ignore=tests/models/multimodal/generation/test_mllama.py \
+      --ignore=tests/models/multimodal/generation/test_pixtral.py \
+      -m cpu_model"
 
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "

.github/ISSUE_TEMPLATE/400-bug-report.yml

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,16 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: markdown
+  attributes:
+    value: |
+      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
+      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
+      - Passwords or authentication credentials
+      - Private URLs or endpoints
+      - Personal or confidential data
+
+      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment

.github/mergify.yml

Lines changed: 20 additions & 0 deletions
@@ -65,6 +65,26 @@ pull_request_rules:
       add:
         - multi-modality
 
+- name: label-rocm
+  description: Automatically apply rocm label
+  conditions:
+    - or:
+      - files~=^csrc/rocm/
+      - files~=^docker/Dockerfile.rocm
+      - files~=^requirements/rocm.*\.txt
+      - files~=^vllm/attention/backends/rocm.*\.py
+      - files~=^vllm/attention/ops/rocm.*\.py
+      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^tests/kernels/.*_rocm.*\.py
+      - files=vllm/platforms/rocm.py
+      - title~=(?i)AMD
+      - title~=(?i)ROCm
+  actions:
+    label:
+      add:
+        - rocm
+
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
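
The `files~=` conditions in the new label-rocm rule are regular expressions matched against changed file paths. As a quick, hypothetical way to sanity-check them locally (not part of this commit, and Mergify's exact matching semantics may differ slightly), a few of the patterns can be mirrored with Python's `re` module:

import re

# A subset of the `files~=` patterns from the label-rocm rule above.
ROCM_FILE_PATTERNS = [
    r"^csrc/rocm/",
    r"^docker/Dockerfile.rocm",
    r"^requirements/rocm.*\.txt",
    r"^vllm/attention/backends/rocm.*\.py",
    r"^tests/kernels/.*_rocm.*\.py",
]

def would_get_rocm_label(path: str) -> bool:
    """Return True if any mirrored pattern matches the given file path."""
    return any(re.search(pattern, path) for pattern in ROCM_FILE_PATTERNS)

print(would_get_rocm_label("csrc/rocm/attention.cu"))       # True
print(would_get_rocm_label("requirements/rocm-build.txt"))  # True
print(would_get_rocm_label("vllm/platforms/cuda.py"))       # False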

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
 
-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*

.pre-commit-config.yaml

Lines changed: 10 additions & 5 deletions
@@ -20,12 +20,10 @@ repos:
     args: [--output-format, github, --fix]
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+  rev: v1.32.0
   hooks:
-  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+  - id: typos
 - repo: https://github.com/PyCQA/isort
   rev: 6.0.1
   hooks:
@@ -145,6 +143,13 @@ repos:
     types: [python]
     pass_filenames: false
     additional_dependencies: [regex]
+  - id: check-pickle-imports
+    name: Prevent new pickle/cloudpickle imports
+    entry: python tools/check_pickle_imports.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
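
The new `check-pickle-imports` hook invokes `tools/check_pickle_imports.py`, whose contents are not shown in this diff. As a rough illustration only, a checker of this kind could look like the sketch below; the file walk, allowlist, and regex are assumptions for the sketch, not the actual tool:

import re
import sys
from pathlib import Path

# Hypothetical allowlist of files that may keep using pickle/cloudpickle;
# the real tool presumably maintains its own list (possibly via pathspec).
ALLOWLIST = {"some/allowed/module.py"}

IMPORT_RE = re.compile(
    r"^\s*(?:import\s+(?:pickle|cloudpickle)\b|from\s+(?:pickle|cloudpickle)\s+import\b)"
)

def main() -> int:
    offenders = []
    for path in Path(".").rglob("*.py"):
        rel = path.as_posix()
        if rel in ALLOWLIST:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        for lineno, line in enumerate(text.splitlines(), start=1):
            if IMPORT_RE.match(line):
                offenders.append(f"{rel}:{lineno}: {line.strip()}")
    if offenders:
        print("New pickle/cloudpickle imports found:")
        print("\n".join(offenders))
        return 1
    return 0

if __name__ == "__main__":
    sys.exit(main())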

benchmarks/P3L_mling.py

Lines changed: 4 additions & 4 deletions
@@ -91,19 +91,19 @@ def get_wikitext2_text(tokenizer):
     return test_enc, test_text
 
 
-def get_flores_plus_text(tokenizer, lng_scrpt):
+def get_flores_plus_text(tokenizer, lng_script):
     hf_hub_download(
         repo_id="alexei-v-ivanov-amd/flores_plus",
         repo_type="dataset",
-        filename=lng_scrpt + ".parquet",
+        filename=lng_script + ".parquet",
         local_dir="./",
     )
 
-    df = pandas.read_parquet("./" + lng_scrpt + ".parquet")
+    df = pandas.read_parquet("./" + lng_script + ".parquet")
     test_text = "\n\n".join(line.strip() for line in df["text"])
     test_enc = tokenizer(test_text)
 
-    os.remove("./" + lng_scrpt + ".parquet")
+    os.remove("./" + lng_script + ".parquet")
 
     return test_enc, test_text

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

import torch
from weight_shapes import WEIGHT_SHAPES

from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
from vllm.triton_utils import triton


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=[
            "torch-bf16",
            # "int8-tensor-w-token-a",
            "int8-tensor-w-tensor-a",
            "int8-channel-w-token-a",
            # "int8-channel-w-tensor-a",
            # "int8-tensor-w-token-a-noquant",
            "int8-tensor-w-tensor-a-noquant",
            "int8-channel-w-token-a-noquant",
            # "int8-channel-w-tensor-a-noquant",
        ],
        line_names=[
            "torch-bf16",
            # "int8-tensor-w-token-a",
            "int8-tensor-w-tensor-a",
            "int8-channel-w-token-a",
            # "int8-channel-w-tensor-a",
            # "int8-tensor-w-token-a-noquant",
            "int8-tensor-w-tensor-a-noquant",
            "int8-channel-w-token-a-noquant",
            # "int8-channel-w-tensor-a-noquant",
        ],
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs INT8 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

    if "torch-bf16" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )

    elif "int8" in provider:
        # Weights are always quantized ahead of time
        if "noquant" in provider:
            # For "no quant", we don't measure the time for activations
            if "tensor-w-token-a" in provider:
                # Dynamic per-token quant for A, static per-tensor quant for B
                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
                assert scale_b_int8.numel() == 1
                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)

            elif "tensor-w-tensor-a" in provider:
                # Static per-tensor quantization with fixed scales for both A and B
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
                assert scale_b_int8.numel() == 1
                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)

            elif "channel-w-token-a" in provider:
                # Dynamic per-channel quantization for weights, per-token quant for A
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
                assert scale_b_int8.numel() == N
                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)

            elif "channel-w-tensor-a" in provider:
                # Dynamic per-channel quantization for weights, per-tensor quant for A
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
                assert scale_b_int8.numel() == N
                a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)

            def run_quant():
                return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)

        else:
            # Quantize the activations during the GEMM call
            if "tensor-w-token-a" in provider:
                # Dynamic per-token quant for A, static per-tensor quant for B
                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
                assert scale_b_int8.numel() == 1

                def run_quant():
                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
                    return vllm_scaled_mm(
                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
                    )

            elif "tensor-w-tensor-a" in provider:
                # Static per-tensor quantization with fixed scales for both A and B
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
                assert scale_b_int8.numel() == 1

                def run_quant():
                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)
                    return vllm_scaled_mm(
                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
                    )

            elif "channel-w-token-a" in provider:
                # Dynamic per-channel quant for weights, per-token quant for A
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
                assert scale_b_int8.numel() == N

                def run_quant():
                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
                    return vllm_scaled_mm(
                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
                    )

            elif "channel-w-tensor-a" in provider:
                # Dynamic per-channel quant for weights, static per-tensor quant for A
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
                assert scale_b_int8.numel() == N

                def run_quant():
                    a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a)
                    return vllm_scaled_mm(
                        a_int8, b_int8, scale_a_int8, scale_b_int8, dtype
                    )

        b_int8 = b_int8.t()

        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), quantiles=quantiles
        )

    # Calculate TFLOP/s, two flops per multiply-add
    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
    return tflops(ms), tflops(max_ms), tflops(min_ms)


def prepare_shapes(args):
    KN_model_names = []
    models_tps = list(itertools.product(args.models, args.tp_sizes))
    for model, tp_size in models_tps:
        assert model in WEIGHT_SHAPES
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            KN.append(model)
            KN_model_names.append(KN)
    return KN_model_names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=[*WEIGHT_SHAPES.keys()],
        help="List of models to benchmark",
    )
    parser.add_argument(
        "--tp-sizes",
        nargs="+",
        type=int,
        default=[1],
        help="List of tensor parallel sizes",
    )
    args = parser.parse_args()

    KN_model_names = prepare_shapes(args)
    for K, N, model_name in KN_model_names:
        print(f"{model_name}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=f"bench_int8_res_n{N}_k{K}",
            N=N,
            K=K,
        )

    print("Benchmark finished!")

benchmarks/kernels/benchmark_moe.py

Lines changed: 4 additions & 10 deletions
@@ -7,7 +7,6 @@
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict
 
 import ray
@@ -43,7 +42,7 @@ def benchmark_config(
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
     num_iters: int = 100,
-    block_quant_shape: List[int] = None,
+    block_quant_shape: list[int] = None,
     use_deep_gemm: bool = False,
 ) -> float:
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@@ -400,7 +399,7 @@ def benchmark(
         dtype: torch.dtype,
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
-        block_quant_shape: List[int] = None,
+        block_quant_shape: list[int] = None,
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
@@ -532,7 +531,7 @@ def save_configs(
     dtype: torch.dtype,
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
-    block_quant_shape: List[int],
+    block_quant_shape: list[int],
 ) -> None:
     dtype_str = get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -563,7 +562,6 @@ def main(args: argparse.Namespace):
     config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
     if args.model_prefix:
         config = getattr(config, args.model_prefix)
-        config = SimpleNamespace(**config)
 
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
@@ -595,11 +593,7 @@ def main(args: argparse.Namespace):
     shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = (
-        torch.float16
-        if current_platform.is_rocm()
-        else getattr(torch, config.torch_dtype)
-    )
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
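
A reading note on the last hunk (not part of the commit): dropping `getattr(torch, ...)` assumes `config.torch_dtype` already holds a `torch.dtype` object rather than a string name. A two-line sketch of the difference, with illustrative values:

import torch

dtype_from_string = getattr(torch, "bfloat16")  # old path: convert a string name to a dtype
dtype_direct = torch.bfloat16                   # new path: attribute assumed to already be a torch.dtype
assert dtype_from_string is dtype_direct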

csrc/cpu/attention.cpp

Lines changed: 3 additions & 3 deletions
@@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }
 
 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
-                                        const int size) {
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
+                                         const int size) {
   T max = max_data[0];
   for (int i = 1; i < size; ++i) {
     max = max >= max_data[i] ? max : max_data[i];
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
 
       if (partition_num == 1) continue;
 
-      reducePartitonSoftmax(
+      reducePartitionSoftmax(
          max_logits + seq_idx * num_heads * max_num_partitions +
              head_idx * max_num_partitions,
          exp_sums + seq_idx * num_heads * max_num_partitions +
