diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
new file mode 100644
index 0000000000000..78347f63fa793
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+ metrics:
+ - name: "exact_match,strict-match"
+ value: 0.356
+ - name: "exact_match,flexible-extract"
+ value: 0.358
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
index 64a0f428587af..6057229ac50f3 100644
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,6 +1,6 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index b2e910e1ba8a7..a67fc89d54e60 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
done
lm_eval --model hf \
- --model_args pretrained=$MODEL,parallelize=True \
- --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
- --batch_size $BATCH_SIZE
+ --model_args "pretrained=$MODEL,parallelize=True" \
+ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+ --batch_size "$BATCH_SIZE"
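
The quoting added here matters whenever a substituted value contains whitespace or glob characters: an unquoted expansion is word-split before lm_eval ever sees it. A minimal sketch of the difference, not part of this patch (the model name below is made up for illustration):

    MODEL="a model name with spaces"                        # hypothetical value
    set -- --model_args pretrained=$MODEL,parallelize=True
    echo $#   # 6: the single key=value string was split into several words
    set -- --model_args "pretrained=$MODEL,parallelize=True"
    echo $#   # 2: one flag and one intact value
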
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 4d32b49a4fac3..65be3c5d93b20 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done
lm_eval --model vllm \
- --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
- --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
- --batch_size $BATCH_SIZE
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+ --batch_size "$BATCH_SIZE"
diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh
index b4fdde6dab425..26f33b744289a 100644
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
done
# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
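
Quoting "$CONFIG" keeps the redirection working even if the path contains spaces. The read invocation itself is worth unpacking: with IFS set to a newline and -d '' (read until NUL, i.e. until EOF), the whole file is split into one array element per line. A small self-contained illustration, not part of this patch, using a throwaway file:

    printf '%s\n' one.yaml two.yaml three.yaml > /tmp/model-list.txt   # hypothetical path
    IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < /tmp/model-list.txt
    echo "${#MODEL_CONFIGS[@]}"            # 3
    printf '[%s]\n' "${MODEL_CONFIGS[@]}"  # one bracketed entry per line
    # note: read returns non-zero at EOF here, so pair it with `|| true` under `set -e`
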
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index eec2a51e2f8fd..3db77d5f16022 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -9,8 +9,11 @@ steps:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
- wait
+
- label: "A100"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
plugins:
@@ -41,20 +44,43 @@ steps:
- name: devshm
emptyDir:
medium: Memory
- # - label: "H100"
- # agents:
- # queue: H100
- # plugins:
- # - docker#v5.11.0:
- # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- # command:
- # - bash
- # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
- # mount-buildkite-agent: true
- # propagate-environment: true
- # ipc: host
- # gpus: all
- # environment:
- # - VLLM_USAGE_SOURCE
- # - HF_TOKEN
+ - label: "H200"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+ agents:
+ queue: H200
+ plugins:
+ - docker#v5.12.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: 4,5,6,7
+ volumes:
+ - /data/benchmark-hf-cache:/root/.cache/huggingface
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
+
+ - label: "H100"
+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+ agents:
+ queue: H100
+ plugins:
+ - docker#v5.12.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+ volumes:
+ - /data/benchmark-hf-cache:/root/.cache/huggingface
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index f90e464288cf1..9d3646e2f6a15 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -56,7 +56,7 @@
def read_markdown(file):
if os.path.exists(file):
- with open(file, "r") as f:
+ with open(file) as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
@@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving):
# collect results
for test_file in results_folder.glob("*.json"):
- with open(test_file, "r") as f:
+ with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving):
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving):
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving):
throughput_results,
serving_results)
+ for df in [latency_results, serving_results, throughput_results]:
+ if df.empty:
+ continue
+
+ # Sort all dataframes by their respective "Test name" columns
+ df.sort_values(by="Test name", inplace=True)
+
+ # The GPUs sometimes come in the format "GPUTYPE\nGPUTYPE\n...",
+ # we want to turn it into "8xGPUTYPE"
+ df["GPU"] = df["GPU"].apply(
+ lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 6059588fe7277..052060c576300 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -72,7 +72,7 @@ def main(args):
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
- with open(test_file, "r") as f:
+ with open(test_file) as f:
results = results + json.loads(f.read())
# generate markdown table
@@ -80,7 +80,7 @@ def main(args):
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
- with open(args.description, "r") as f:
+ with open(args.description) as f:
description = f.read()
description = description.format(
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
index e9d7d6a8d760a..fb5063db86942 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -50,31 +50,30 @@ launch_trt_server() {
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
- git checkout $trt_llm_version
- tensorrtllm_backend_dir=$(pwd)
+ git checkout "$trt_llm_version"
git submodule update --init --recursive
# build trtllm engine
cd /tensorrtllm_backend
- cd ./tensorrt_llm/examples/${model_type}
+ cd "./tensorrt_llm/examples/${model_type}"
python3 convert_checkpoint.py \
- --model_dir ${model_path} \
- --dtype ${model_dtype} \
- --tp_size ${model_tp_size} \
- --output_dir ${trt_model_path}
+ --model_dir "${model_path}" \
+ --dtype "${model_dtype}" \
+ --tp_size "${model_tp_size}" \
+ --output_dir "${trt_model_path}"
trtllm-build \
- --checkpoint_dir ${trt_model_path} \
+ --checkpoint_dir "${trt_model_path}" \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
- --gpt_attention_plugin ${model_dtype} \
- --gemm_plugin ${model_dtype} \
- --tp_size ${model_tp_size} \
- --max_batch_size ${max_batch_size} \
- --max_input_len ${max_input_len} \
- --max_seq_len ${max_seq_len} \
- --max_num_tokens ${max_num_tokens} \
- --output_dir ${trt_engine_path}
+ --gpt_attention_plugin "${model_dtype}" \
+ --gemm_plugin "${model_dtype}" \
+ --tp_size "${model_tp_size}" \
+ --max_batch_size "${max_batch_size}" \
+ --max_input_len "${max_input_len}" \
+ --max_seq_len "${max_seq_len}" \
+ --max_num_tokens "${max_num_tokens}" \
+ --output_dir "${trt_engine_path}"
# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
@@ -82,15 +81,15 @@ launch_trt_server() {
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
- cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+ cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
- python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
- python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
- python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
- python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
+ python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
+ python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
+ python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
+ python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
- --world_size=${model_tp_size} \
+ --world_size="${model_tp_size}" \
--model_repo=/tensorrtllm_backend/triton_model_repo &
}
@@ -98,10 +97,7 @@ launch_trt_server() {
launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
@@ -129,10 +125,7 @@ launch_tgi_server() {
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
server_command="lmdeploy serve api_server $model \
@@ -149,10 +142,7 @@ launch_sglang_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
@@ -185,10 +175,7 @@ launch_vllm_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
@@ -217,19 +204,19 @@ launch_vllm_server() {
main() {
- if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
launch_trt_server
fi
- if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
launch_tgi_server
fi
- if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
launch_lmdeploy_server
fi
- if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
launch_sglang_server
fi
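
Several of these launch functions feed "$server_params" through json2args, which is defined elsewhere in the benchmark scripts. As a rough sketch of the idea only (this is not the repository's implementation, and the exact flag formatting is assumed), a JSON object can be turned into CLI flags with jq:

    # illustrative stand-in, not the real json2args
    json2args() {
      echo "$1" | jq -r \
        'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
    }
    json2args '{"tensor_parallel_size": 4, "dtype": "float16"}'
    # --tensor-parallel-size 4 --dtype float16
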
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index c6a1bbdeb7d48..686f70dbece6c 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -16,10 +16,10 @@ main() {
fi
# initial annotation
- description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+ #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
@@ -30,15 +30,15 @@ main() {
/workspace/buildkite-agent artifact upload "results.zip"
# upload benchmarking scripts
- cd $VLLM_SOURCE_CODE_LOC/
+ cd "$VLLM_SOURCE_CODE_LOC/"
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
- cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+ cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
- cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+ cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
@@ -75,4 +75,4 @@ main() {
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
-main "$@"
\ No newline at end of file
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index dd8c15e0700eb..3f38cf5137535 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -12,7 +12,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
- declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
echo "GPU type is $gpu_type"
}
@@ -102,7 +102,7 @@ kill_gpu_processes() {
pkill -f text-generation
pkill -f lmdeploy
- while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+ while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
}
@@ -119,8 +119,8 @@ wait_for_server() {
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
- if ! which $cmd >/dev/null; then
- apt-get update && apt-get install -y $cmd
+ if ! which "$cmd" >/dev/null; then
+ apt-get update && apt-get install -y "$cmd"
fi
}
@@ -173,13 +173,11 @@ run_serving_tests() {
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
- bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
+ bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi
- wait_for_server
-
- if [ $? -eq 0 ]; then
+ if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
@@ -190,13 +188,13 @@ run_serving_tests() {
# prepare tokenizer
# this is required for lmdeploy.
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
# change model name for lmdeploy (it will not follow standard hf name)
@@ -307,11 +305,11 @@ run_serving_tests() {
prepare_dataset() {
# download sharegpt dataset
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
echo "" > sonnet_4x.txt
for _ in {1..4}
do
@@ -339,17 +337,17 @@ main() {
prepare_dataset
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
- BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+ BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# run the test
- run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+ run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
- python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+ python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
upload_to_buildkite
}
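
The switch from calling wait_for_server and then testing `if [ $? -eq 0 ]` to `if wait_for_server; then` branches on the function's exit status directly, so nothing can clobber $? between the call and the check. A toy illustration, not part of this patch (the health check below is invented):

    check_health() { curl -sf localhost:8000/health > /dev/null; }

    # fragile: echo overwrites $? before it is examined
    check_health
    echo "checked at $(date)"
    if [ $? -eq 0 ]; then echo "this branch reflects echo, not check_health"; fi

    # robust: branch on the call itself
    if check_health; then
      echo "server is up"
    else
      echo "server is down"
    fi
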
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index a0b9a409b758d..0d16a83781ab2 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -6,6 +6,7 @@
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
+set -x
set -o pipefail
check_gpus() {
@@ -17,7 +18,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
- declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
echo "GPU type is $gpu_type"
}
@@ -85,15 +86,11 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
- pkill -f pt_main_thread
- # this line doesn't work now
- # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
- pkill -f python3
- pkill -f /usr/bin/python3
+ pgrep python3 | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
- while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+ while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
@@ -117,7 +114,7 @@ upload_to_buildkite() {
fi
# Use the determined command to annotate and upload artifacts
- $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
+ $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
@@ -150,7 +147,7 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
@@ -206,9 +203,9 @@ run_throughput_tests() {
throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test
- tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
+ tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
@@ -270,7 +267,7 @@ run_serving_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
@@ -278,7 +275,7 @@ run_serving_tests() {
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
- echo "Server model and client model must be the same. Skip testcase $testname."
+ echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
@@ -289,12 +286,11 @@ run_serving_tests() {
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
- eval "$server_command" &
+ bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
- wait_for_server
- if [ $? -eq 0 ]; then
+ if wait_for_server; then
echo ""
echo "vllm server is up and running."
else
@@ -323,7 +319,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
- eval "$client_command"
+ bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
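
Among the changes to this script, `pgrep python3 | xargs -r kill -9` replaces the stack of pkill patterns; GNU xargs' -r flag skips running kill entirely when pgrep matches nothing, so the cleanup does not fail on an empty list. Roughly, as an illustration outside this patch (process name invented):

    # kill every matching process, quietly doing nothing if there are none
    pgrep -f my_benchmark_server | xargs -r kill -9
    # without -r, an empty match would invoke `kill -9` with no PIDs and error out
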
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 4e4d4cd4ca3c6..92d6fad73a94c 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -36,11 +36,11 @@
# collect results
for test_file in results_folder.glob("*.json"):
- with open(test_file, "r") as f:
+ with open(test_file) as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
index f16862907def1..19f7160e68a4d 100644
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10
retries=0
while [ $retries -lt 1000 ]; do
- if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+ if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
exit 0
fi
@@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do
sleep 5
done
-exit 1
\ No newline at end of file
+exit 1
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 98592ea7948f2..f78e360b7afd3 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -3,31 +3,26 @@ steps:
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- # rename the files to change linux -> manylinux1
- - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+ - "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- - block: "Build CUDA 11.8 wheel"
- key: block-build-cu118-wheel
-
+ # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+ # However, this block can be uncommented to save some compute hours.
+ # - block: "Build CUDA 11.8 wheel"
+ # key: block-build-cu118-wheel
+
- label: "Build wheel - CUDA 11.8"
- depends_on: block-build-cu118-wheel
+ # depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- # rename the files to change linux -> manylinux1
- - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+ - "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
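
The wheel rename loop that previously lived inline here (and presumably moved into upload-wheels.sh) combines two details: `$$` is Buildkite's escape for a literal `$` inside a command string, and `${f/linux/manylinux1}` is bash's replace-first-match substitution. Outside the YAML, the loop reads like this (the sample wheel name mirrors the one above):

    mkdir -p artifacts/dist
    touch artifacts/dist/vllm-1.0.0.dev-cp38-abi3-linux_x86_64.whl
    for f in artifacts/dist/*.whl; do
      mv -- "$f" "${f/linux/manylinux1}"   # first "linux" in the path becomes "manylinux1"
    done
    ls artifacts/dist   # vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
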
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index df201cdc7c554..902e162720b89 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
@@ -31,8 +33,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
- # Remove unused volumes
- docker volume prune -f
+ # Remove unused volumes and prune unused images older than 72 hours.
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
@@ -57,17 +59,17 @@ done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-docker pull ${image_name}
+docker pull "${image_name}"
remove_docker_container() {
- docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+ docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p ${HF_CACHE}
+mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands=$@
@@ -107,35 +109,36 @@ fi
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
+ # use the parallel job count as the number of shards
+ commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
- #replace shard arguments
- commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
- commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
- echo "Shard ${GPU} commands:$commands"
+ # assign shard-id for each shard
+ commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+ echo "Shard ${GPU} commands:$commands_gpu"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
- -e HIP_VISIBLE_DEVICES=${GPU} \
+ -e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
- -v ${HF_CACHE}:${HF_MOUNT} \
- -e HF_HOME=${HF_MOUNT} \
- --name ${container_name}_${GPU} \
- ${image_name} \
- /bin/bash -c "${commands}" \
+ -v "${HF_CACHE}:${HF_MOUNT}" \
+ -e "HF_HOME=${HF_MOUNT}" \
+ --name "${container_name}_${GPU}" \
+ "${image_name}" \
+ /bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
- for pid in ${PIDS[@]}; do
- wait ${pid}
+ for pid in "${PIDS[@]}"; do
+ wait "${pid}"
STATUS+=($?)
done
- for st in ${STATUS[@]}; do
+ for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
- exit ${st}
+ exit "${st}"
fi
done
else
@@ -146,9 +149,9 @@ else
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
- -v ${HF_CACHE}:${HF_MOUNT} \
- -e HF_HOME=${HF_MOUNT} \
- --name ${container_name} \
- ${image_name} \
+ -v "${HF_CACHE}:${HF_MOUNT}" \
+ -e "HF_HOME=${HF_MOUNT}" \
+ --name "${container_name}" \
+ "${image_name}" \
/bin/bash -c "${commands}"
fi
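
The fix in the shard loop is subtle: the old code rewrote `commands` in place, so after the first iteration the `--shard-id= ` placeholder was gone and every later GPU reused shard 0; deriving a fresh `commands_gpu` each time keeps the placeholder intact. The `${var//pattern/replacement}` expansion it relies on behaves like this (toy command string, not part of this patch):

    commands="pytest -v -s models --shard-id= --num-shards= "
    commands=${commands//"--num-shards= "/"--num-shards=8 "}
    for GPU in 0 1; do
      commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
      echo "$commands_gpu"
    done
    # pytest -v -s models --shard-id=0 --num-shards=8
    # pytest -v -s models --shard-id=1 --num-shards=8
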
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index cbf6dda677c53..1641c1faa9d6a 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex
diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index fd60f5b6afeca..5d7a0bff90963 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
@@ -13,27 +15,38 @@ remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
-
-# Run basic model test
-docker exec cpu-test bash -c "
- pip install pytest matplotlib einops transformers_stream_generator
- pytest -v -s tests/models -m \"not vlm\" \
- --ignore=tests/models/test_embedding.py \
- --ignore=tests/models/test_oot_registration.py \
- --ignore=tests/models/test_registry.py \
- --ignore=tests/models/test_jamba.py \
- --ignore=tests/models/test_mamba.py \
- --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
-
-# online inference
-docker exec cpu-test bash -c "
- python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
- timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
- python3 benchmarks/benchmark_serving.py \
- --backend vllm \
- --dataset-name random \
- --model facebook/opt-125m \
- --num-prompts 20 \
- --endpoint /v1/completions \
- --tokenizer facebook/opt-125m"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
+
+function cpu_tests() {
+ set -e
+
+ # Run basic model test
+ docker exec cpu-test bash -c "
+ set -e
+ pip install pytest pytest-asyncio \
+ decord einops librosa peft Pillow sentence-transformers soundfile \
+ transformers_stream_generator matplotlib datamodel_code_generator
+ pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+ pytest -v -s tests/models/decoder_only/language -m cpu_model
+ pytest -v -s tests/models/embedding/language -m cpu_model
+ pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+ pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+ pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+
+ # online inference
+ docker exec cpu-test bash -c "
+ set -e
+ python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+ timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+ python3 benchmarks/benchmark_serving.py \
+ --backend vllm \
+ --dataset-name random \
+ --model facebook/opt-125m \
+ --num-prompts 20 \
+ --endpoint /v1/completions \
+ --tokenizer facebook/opt-125m"
+}
+
+# All CPU tests are expected to finish within 25 minutes.
+export -f cpu_tests
+timeout 25m bash -c "cpu_tests"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index c2818c38965ea..4f1729d46dae2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -1,57 +1,85 @@
+#!/bin/bash
+
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
+# allow binding to a different core range / NUMA node
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
# Try building the docker image
-numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
+remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
-
-# offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
-
-# Run basic model test
-docker exec cpu-test bash -c "
- pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
- pytest -v -s tests/models/encoder_decoder/language
- pytest -v -s tests/models/decoder_only/language \
- --ignore=tests/models/test_fp8.py \
- --ignore=tests/models/decoder_only/language/test_jamba.py \
- --ignore=tests/models/decoder_only/language/test_mamba.py \
- --ignore=tests/models/decoder_only/language/test_granitemoe.py \
- --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
-
-# Run compressed-tensor test
-# docker exec cpu-test bash -c "
-# pytest -s -v \
-# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
-
-# Run AWQ test
-docker exec cpu-test bash -c "
- pytest -s -v \
- tests/quantization/test_ipex_quant.py"
-
-# online inference
-docker exec cpu-test bash -c "
- export VLLM_CPU_KVCACHE_SPACE=10
- export VLLM_CPU_OMP_THREADS_BIND=48-92
- python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
- timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
- python3 benchmarks/benchmark_serving.py \
- --backend vllm \
- --dataset-name random \
- --model facebook/opt-125m \
- --num-prompts 20 \
- --endpoint /v1/completions \
- --tokenizer facebook/opt-125m"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+
+function cpu_tests() {
+ set -e
+ export NUMA_NODE=$2
+
+ # offline inference
+ docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+ set -e
+ python3 examples/offline_inference.py"
+
+ # Run basic model test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pip install pytest pytest-asyncio \
+ decord einops librosa peft Pillow sentence-transformers soundfile \
+ transformers_stream_generator matplotlib datamodel_code_generator
+ pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+ pytest -v -s tests/models/decoder_only/language -m cpu_model
+ pytest -v -s tests/models/embedding/language -m cpu_model
+ pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+ pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+ pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+
+ # Run compressed-tensor test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -s -v \
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+
+ # Run AWQ test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -s -v \
+ tests/quantization/test_ipex_quant.py"
+
+ # Run chunked-prefill and prefix-cache test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -s -v -k cpu_model \
+ tests/basic_correctness/test_chunked_prefill.py"
+
+ # online inference
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ export VLLM_CPU_KVCACHE_SPACE=10
+ export VLLM_CPU_OMP_THREADS_BIND=$1
+ python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
+ timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+ python3 benchmarks/benchmark_serving.py \
+ --backend vllm \
+ --dataset-name random \
+ --model facebook/opt-125m \
+ --num-prompts 20 \
+ --endpoint /v1/completions \
+ --tokenizer facebook/opt-125m"
+}
+
+# All CPU tests are expected to finish within 25 minutes.
+export -f cpu_tests
+timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
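
`timeout` runs an external command, not a shell function, so the function is exported with `export -f` and invoked through `bash -c`, with the core range and NUMA node forwarded as positional parameters. A stripped-down version of the pattern, not part of this patch (the function body is made up):

    cpu_demo() { echo "binding to cores $1 on NUMA node $2"; sleep 1; }
    export -f cpu_demo                      # bash-only: make the function visible to child shells
    CORE_RANGE=48-95 NUMA_NODE=1
    timeout 10s bash -c "cpu_demo $CORE_RANGE $NUMA_NODE"
    # binding to cores 48-95 on NUMA node 1
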
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
new file mode 100644
index 0000000000000..fa4f74fca7a11
--- /dev/null
+++ b/.buildkite/run-hpu-test.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# This script builds the HPU docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t hpu-test-env -f Dockerfile.hpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f hpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh
index 7ac4dcc4c786d..530bf90a855fe 100755
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
-if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
+if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
@@ -23,7 +23,7 @@ fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
- echo $command
+ echo "$command"
done
start_network() {
@@ -36,7 +36,7 @@ start_nodes() {
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
- if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+ if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
@@ -49,17 +49,20 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
- docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+ docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+ -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
+ --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
+ /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
- if [ $node -eq 0 ]; then
+ if [ "$node" -eq 0 ]; then
# start the ray head node
- docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+ docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
- docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+ docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
@@ -79,22 +82,22 @@ run_nodes() {
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
- if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+ if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
- if [ $node -ne 0 ]; then
- docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+ if [ "$node" -ne 0 ]; then
+ docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
- docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+ docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
- docker stop node$node
+ docker stop "node$node"
done
docker network rm docker-net
}
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 252c0f7fecd12..9259391aaed49 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
@@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
- echo $current_time > /tmp/neuron-docker-build-timestamp
+ echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
- echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+ date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron .
@@ -34,7 +36,7 @@ wait_for_server_to_start() {
timeout=300
counter=0
- while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+ while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 70e56596c4a86..6b12f424fd828 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
@@ -11,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 6989c94d46a89..770dad6ffa3a1 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
# Build the docker image.
@@ -12,4 +14,4 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index 6ffa66d5ef3d6..faeac8e2ded36 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4385f250856e7..c436d2b48d20f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,7 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually)
+# optional(bool): never run this test by default (i.e. it needs to be unblocked manually) unless it is part of a scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -50,7 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
+ - tests/test_lazy_torch_compile.py
commands:
+ - python3 test_lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -77,8 +79,8 @@ steps:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
mirror_hardwares: [amd]
@@ -88,11 +90,7 @@ steps:
- vllm/distributed
- tests/core
commands:
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
- - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
+ - pytest -v -s core
- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
@@ -123,6 +121,7 @@ steps:
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands:
+ - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
@@ -167,6 +166,14 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
+- label: V1 Test
+ #mirror_hardwares: [amd]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1
+
- label: Examples Test # 15min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
@@ -184,6 +191,7 @@ steps:
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
+ - python3 offline_profile.py --model facebook/opt-125m
- label: Prefix Caching Test # 9min
#mirror_hardwares: [amd]
@@ -191,8 +199,7 @@ steps:
- vllm/
- tests/prefix_caching
commands:
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
- - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
+ - pytest -v -s prefix_caching
- label: Samplers Test # 36min
source_file_dependencies:
@@ -216,8 +223,7 @@ steps:
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
@@ -234,15 +240,16 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
+ # these tests need to be run separately; they cannot be combined
+ - pytest -v -s compile/piecewise/test_simple.py
+ - pytest -v -s compile/piecewise/test_toy_llama.py
-# TODO: re-write in comparison tests, and fix symbolic shape
-# for quantization ops.
-# - label: "PyTorch Fullgraph Test" # 18min
-# source_file_dependencies:
-# - vllm/
-# - tests/compile
-# commands:
-# - pytest -v -s compile/test_full_graph.py
+- label: "PyTorch Fullgraph Test" # 18min
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
@@ -271,7 +278,6 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- - pip install aiohttp
- bash run-benchmarks.sh
- label: Quantization Test # 33min
@@ -308,46 +314,70 @@ steps:
##### models test #####
-- label: Basic Models Test # 3min
+- label: Basic Models Test # 30min
source_file_dependencies:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+ - pytest -v -s models/test_registry.py
+ - pytest -v -s models/test_initialization.py
-- label: Decoder-only Language Models Test # 1h36min
+- label: Language Models Test (Standard) # 42min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
+ - tests/models/embedding/language
+ - tests/models/encoder_decoder/language
commands:
- - pytest -v -s models/decoder_only/language
+ - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+ - pytest -v -s models/embedding/language -m core_model
+ - pytest -v -s models/embedding/vision_language -m core_model
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Language Models Test (Extended) # 50min
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/language
+ - tests/models/embedding/language
+ - tests/models/encoder_decoder/language
+ commands:
+ - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+ - pytest -v -s models/embedding/language -m 'not core_model'
+ - pytest -v -s models/embedding/vision_language -m 'not core_model'
+
+- label: Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
+ - tests/models/embedding/vision_language
+ - tests/models/encoder_decoder/vision_language
commands:
- - pytest -v -s models/decoder_only/audio_language
- - pytest -v -s models/decoder_only/vision_language
+ - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+ - pytest -v -s models/encoder_decoder/language -m core_model
+ - pytest -v -s models/encoder_decoder/vision_language -m core_model
-- label: Other Models Test # 6min
- #mirror_hardwares: [amd]
+- label: Multi-Modal Models Test (Extended) # 1h15m
+ optional: true
source_file_dependencies:
- vllm/
- - tests/models/embedding/language
+ - tests/models/decoder_only/audio_language
+ - tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- - tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- - pytest -v -s models/embedding/language
- - pytest -v -s models/embedding/vision_language
- - pytest -v -s models/encoder_decoder/language
- - pytest -v -s models/encoder_decoder/vision_language
+ - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+ # HACK - run phi3v tests separately to sidestep this transformers bug
+ # https://github.com/huggingface/transformers/issues/34307
+ - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+ - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
@@ -404,16 +434,15 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
@@ -485,6 +514,7 @@ steps:
- label: Distributed Tests (A100) # optional
gpu: a100
+ optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
@@ -492,11 +522,13 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
+ optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
new file mode 100644
index 0000000000000..7345dd4e66b29
--- /dev/null
+++ b/.buildkite/upload-wheels.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+ echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+ exit 1
+fi
+
+# Get the single wheel file
+wheel="${wheel_files[0]}"
+
+# Rename 'linux' to 'manylinux1' in the wheel filename
+new_wheel="${wheel/linux/manylinux1}"
+mv -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "Version: $version"
+
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+ suffix="${version##*.}"
+ if [[ $suffix == cu* ]]; then
+ new_version="1.0.0.dev+${suffix}"
+ else
+ new_version="1.0.0.dev"
+ fi
+ new_wheel="${wheel/$version/$new_version}"
+ mv -- "$wheel" "$new_wheel"
+ wheel="$new_wheel"
+ version="$new_version"
+fi
+
+# Upload the wheel to S3
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
diff --git a/.dockerignore b/.dockerignore
index 575f087f3ef6f..3863656915d03 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,4 +1,3 @@
-/.github/
/.venv
/build
dist
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index cd721971d01d6..3cb91fc0f8232 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,13 +3,16 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-CMakeLists.txt @tlrmchlsmth @WoosukKwon
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth
+
+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 71f4e520135d4..d1f6105a47166 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,2 +1,2 @@
github: [vllm-project]
-open_collective: [vllm]
+open_collective: vllm
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index be0afc6305044..51a73c857ccb2 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
-**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
-
----
-
-
-
- PR Checklist (Click to Expand)
-
-Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.
-
-PR Title and Classification
-Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:
-
- [Bugfix] for bug fixes.
- [CI/Build] for build or continuous integration improvements.
- [Doc] for documentation fixes and improvements.
- [Model] for adding a new model or improving an existing model. Model name should appear in the title.
- [Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
- [Kernel] for changes affecting CUDA kernels or other compute kernels.
- [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
- [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
- [Misc] for PRs that do not fit the above categories. Please use this sparingly.
-
-Note: If the PR spans more than one category, please include all relevant prefixes.
-
-Code Quality
-
-The PR needs to meet the following code quality standards:
-
- - We adhere to Google Python style guide and Google C++ style guide.
- - Pass all linter checks. Please use format.sh to format your code.
- - The code needs to be well-documented to ensure future contributors can easily understand the code.
- - Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.
- - Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.
-
-Adding or changing kernels
-Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
-
- - Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators and The Custom Operators Manual
- - Custom operations that return Tensors require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.
- - Use torch.library.opcheck() to test the function registration and meta-function for any registered ops. See tests/kernels for examples.
- - When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.
- - If a new custom type is needed, see the following document: Custom Class Support in PT2.
-
-Notes for Large Changes
-Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not go through the PR.
-
-What to Expect for the Reviews
-
-The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:
-
- - After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
- - After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
- - After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
- - Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
-
-Thank You
-
- Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!
-
-
-
-
-
+**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 6fddca0d6e4b9..683b70cd89989 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,27 @@ updates:
directory: "/"
schedule:
interval: "weekly"
+ - package-ecosystem: "pip"
+ directory: "/"
+ schedule:
+ interval: "weekly"
+ labels: ["dependencies"]
+ open-pull-requests-limit: 5
+ reviewers: ["khluu", "simon-mo"]
+ allow:
+ - dependency-type: "all"
+ ignore:
+ - dependency-name: "*"
+ update-types: ["version-update:semver-patch"]
+ - dependency-name: "torch"
+ - dependency-name: "torchvision"
+ - dependency-name: "xformers"
+ - dependency-name: "lm-format-enforcer"
+ - dependency-name: "gguf"
+ - dependency-name: "compressed-tensors"
+ - dependency-name: "ray[adag]"
+ - dependency-name: "lm-eval"
+ groups:
+ minor-update:
+ applies-to: version-updates
+ update-types: ["minor"]
diff --git a/.github/mergify.yml b/.github/mergify.yml
new file mode 100644
index 0000000000000..ca4bd7ee2b87f
--- /dev/null
+++ b/.github/mergify.yml
@@ -0,0 +1,60 @@
+pull_request_rules:
+- name: label-documentation
+ description: Automatically apply documentation label
+ conditions:
+ - or:
+ - files~=^[^/]+\.md$
+ - files~=^docs/
+ actions:
+ label:
+ add:
+ - documentation
+
+- name: label-ci-build
+ description: Automatically apply ci/build label
+ conditions:
+ - or:
+ - files~=^\.github/
+ - files~=\.buildkite/
+ - files~=^cmake/
+ - files=CMakeLists.txt
+ - files~=^Dockerfile
+ - files~=^requirements.*\.txt
+ - files=setup.py
+ actions:
+ label:
+ add:
+ - ci/build
+
+- name: label-frontend
+ description: Automatically apply frontend label
+ conditions:
+ - files~=^vllm/entrypoints/
+ actions:
+ label:
+ add:
+ - frontend
+
+- name: ping author on conflicts and add 'needs-rebase' label
+ conditions:
+ - conflict
+ - -closed
+ actions:
+ label:
+ add:
+ - needs-rebase
+ comment:
+ message: |
+ This pull request has merge conflicts that must be resolved before it can be
+ merged. Please rebase the PR, @{{author}}.
+
+ https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+- name: remove 'needs-rebase' label when conflict is resolved
+ conditions:
+ - -conflict
+ - -closed
+ actions:
+ label:
+ remove:
+ - needs-rebase
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
new file mode 100755
index 0000000000000..3246c6f9bc4b7
--- /dev/null
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+set -eu
+
+# ensure 1 argument is passed
+if [ "$#" -ne 1 ]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+PR_NUMBER=$1
+OLD=/tmp/orig_pr_body.txt
+NEW=/tmp/new_pr_body.txt
+
+gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
+cp "${OLD}" "${NEW}"
+
+# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
+sed -i '/FIX #xxxx.*$/d' "${NEW}"
+
+# Remove "FILL IN THE PR DESCRIPTION HERE"
+sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
+
+# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
+python3 - <<EOF
+import re
+
+with open("${NEW}", "r") as file:
+    content = file.read()
+
+pattern = re.compile(r'<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?<\/summary>.*?<\/details>', re.DOTALL)
+content = re.sub(pattern, '', content)
+
+with open("${NEW}", "w") as file:
+ file.write(content)
+EOF
+
+# Run this only if ${NEW} is different than ${OLD}
+if ! cmp -s "${OLD}" "${NEW}"; then
+ gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
+ echo
+ echo "Updated PR body:"
+ echo
+ cat "${NEW}"
+else
+ echo "No changes needed"
+fi
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 2a0e3239f58da..0226cf0ca00e9 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -6,12 +6,14 @@ on:
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
+ - '.github/workflows/matchers/actionlint.json'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
+ - '.github/workflows/matchers/actionlint.json'
env:
LC_ALL: en_US.UTF-8
@@ -28,10 +30,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
+ echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 2e7c7f7f087af..c9d6d4259df99 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@v7
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index 064af291009fa..68149d2dc019f 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -6,9 +6,21 @@ on:
push:
branches:
- main
+ paths:
+ - '**/*.h'
+ - '**/*.cpp'
+ - '**/*.cu'
+ - '**/*.cuh'
+ - '.github/workflows/clang-format.yml'
pull_request:
branches:
- main
+ paths:
+ - '**/*.h'
+ - '**/*.cpp'
+ - '**/*.cu'
+ - '**/*.cuh'
+ - '.github/workflows/clang-format.yml'
jobs:
clang-format:
@@ -17,9 +29,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -38,4 +50,4 @@ jobs:
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
- | xargs clang-format --dry-run --Werror
\ No newline at end of file
+ | xargs clang-format --dry-run --Werror
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
new file mode 100644
index 0000000000000..0085a1cc22373
--- /dev/null
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -0,0 +1,26 @@
+name: Cleanup PR Body
+
+on:
+ pull_request_target:
+ types: [opened, reopened, edited]
+
+permissions:
+ pull-requests: write
+
+jobs:
+ update-description:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ - name: Set up Python
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: '3.12'
+
+ - name: Update PR description
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
new file mode 100644
index 0000000000000..68887adaae54b
--- /dev/null
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,45 @@
+name: codespell
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ paths:
+ - "**/*.py"
+ - "**/*.md"
+ - "**/*.rst"
+ - pyproject.toml
+ - requirements-lint.txt
+ - .github/workflows/codespell.yml
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "**/*.py"
+ - "**/*.md"
+ - "**/*.rst"
+ - pyproject.toml
+ - requirements-lint.txt
+ - .github/workflows/codespell.yml
+
+jobs:
+ codespell:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.12"]
+ steps:
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Spelling check with codespell
+ run: |
+ codespell --toml pyproject.toml
diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json
new file mode 100644
index 0000000000000..f048fce528941
--- /dev/null
+++ b/.github/workflows/matchers/mypy.json
@@ -0,0 +1,16 @@
+{
+ "problemMatcher": [
+ {
+ "owner": "mypy",
+ "pattern": [
+ {
+ "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
+ "file": 1,
+ "line": 2,
+ "severity": 3,
+ "message": 4
+ }
+ ]
+ }
+ ]
+}
diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json
new file mode 100644
index 0000000000000..f6d4479ee1996
--- /dev/null
+++ b/.github/workflows/matchers/ruff.json
@@ -0,0 +1,17 @@
+{
+ "problemMatcher": [
+ {
+ "owner": "ruff",
+ "pattern": [
+ {
+ "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
+ "file": 1,
+ "line": 2,
+ "column": 3,
+ "code": 4,
+ "message": 5
+ }
+ ]
+ }
+ ]
+ }
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 22e3564779ad9..73eeacf1fa562 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -6,20 +6,35 @@ on:
push:
branches:
- main
+ paths:
+ - '**/*.py'
+ - '.github/workflows/mypy.yaml'
+ - 'tools/mypy.sh'
+ - 'pyproject.toml'
pull_request:
branches:
- main
+ # This workflow is only relevant when one of the following files changes.
+ # However, we have github configured to expect and require this workflow
+ # to run and pass before github will auto-merge a pull request. Until github
+ # allows more flexible auto-merge policy, we can just run this on every PR.
+ # It doesn't take that long to run, anyway.
+ #paths:
+ # - '**/*.py'
+ # - '.github/workflows/mypy.yaml'
+ # - 'tools/mypy.sh'
+ # - 'pyproject.toml'
jobs:
mypy:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -32,4 +47,5 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
- tools/mypy.sh
+ echo "::add-matcher::.github/workflows/matchers/mypy.json"
+ tools/mypy.sh 1 ${{ matrix.python-version }}
diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml
new file mode 100644
index 0000000000000..4932af943a07b
--- /dev/null
+++ b/.github/workflows/png-lint.yml
@@ -0,0 +1,37 @@
+name: Lint PNG exports from excalidraw
+on:
+ push:
+ branches:
+ - "main"
+ paths:
+ - '*.excalidraw.png'
+ - '.github/workflows/png-lint.yml'
+ pull_request:
+ branches:
+ - "main"
+ paths:
+ - '*.excalidraw.png'
+ - '.github/workflows/png-lint.yml'
+
+env:
+ LC_ALL: en_US.UTF-8
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+
+jobs:
+ actionlint:
+ runs-on: ubuntu-latest
+ steps:
+ - name: "Checkout"
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: "Run png-lint.sh to check excalidraw exported images"
+ run: |
+ tools/png-lint.sh
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 96549b3f99181..c1051d10a4860 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,7 +21,7 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Extract branch info
shell: bash
@@ -30,7 +30,7 @@ jobs:
- name: Create Release
id: create_release
- uses: "actions/github-script@v7"
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
@@ -48,16 +48,16 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
- python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@v1.2
+ uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
@@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
@@ -92,7 +92,7 @@ jobs:
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
- name: Upload Release Asset
- uses: actions/upload-release-asset@v1
+ uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index d1791c3bc865a..df62539c0b3d9 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@v7
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
github.rest.issues.createComment({
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index be73fb85ed1fa..7266cc378cfb0 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -6,32 +6,47 @@ on:
push:
branches:
- main
+ paths:
+ - "**/*.py"
+ - pyproject.toml
+ - requirements-lint.txt
+ - .github/workflows/matchers/ruff.json
+ - .github/workflows/ruff.yml
pull_request:
branches:
- main
+ # This workflow is only relevant when one of the following files changes.
+ # However, we have github configured to expect and require this workflow
+ # to run and pass before github will auto-merge a pull request. Until github
+ # allows more flexible auto-merge policy, we can just run this on every PR.
+ # It doesn't take that long to run, anyway.
+ #paths:
+ # - "**/*.py"
+ # - pyproject.toml
+ # - requirements-lint.txt
+ # - .github/workflows/matchers/ruff.json
+ # - .github/workflows/ruff.yml
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.12"]
steps:
- - uses: actions/checkout@v4
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r requirements-lint.txt
- - name: Analysing the code with ruff
- run: |
- ruff check .
- - name: Spelling check with codespell
- run: |
- codespell --toml pyproject.toml
- - name: Run isort
- run: |
- isort . --check-only
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Analysing the code with ruff
+ run: |
+ echo "::add-matcher::.github/workflows/matchers/ruff.json"
+ ruff check --output-format github .
+ - name: Run isort
+ run: |
+ isort . --check-only
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index 9e0a698990b3b..122e4e101e201 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -1,4 +1,5 @@
#!/bin/bash
+set -eux
python_executable=python$1
cuda_home=/usr/local/cuda-$2
@@ -15,5 +16,8 @@ export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
+
+bash tools/check_repo.sh
+
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh
index 312c6e82f33a3..3d0b7a1fe0402 100644
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@@ -1,16 +1,16 @@
#!/bin/bash
# Replace '.' with '-' ex: 11.8 -> 11-8
-cuda_version=$(echo $1 | tr "." "-")
+cuda_version=$(echo "$1" | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
-OS=$(echo $2 | tr -d ".\-")
+OS=$(echo "$2" | tr -d ".\-")
# Installs CUDA
-wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
-sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
sudo apt clean
# Test nvcc
diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh
index dfc1851d7692c..e3cda7dad2d17 100644
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@@ -6,7 +6,7 @@ cuda_version=$3
# Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
+$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
# Print version information
$python_executable --version
diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml
new file mode 100644
index 0000000000000..4b1587e373e17
--- /dev/null
+++ b/.github/workflows/shellcheck.yml
@@ -0,0 +1,37 @@
+name: Lint shell scripts
+on:
+ push:
+ branches:
+ - "main"
+ paths:
+ - '**/*.sh'
+ - '.github/workflows/shellcheck.yml'
+ pull_request:
+ branches:
+ - "main"
+ paths:
+ - '**/*.sh'
+ - '.github/workflows/shellcheck.yml'
+
+env:
+ LC_ALL: en_US.UTF-8
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+
+jobs:
+ shellcheck:
+ runs-on: ubuntu-latest
+ steps:
+ - name: "Checkout"
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: "Check shell scripts"
+ run: |
+ tools/shellcheck.sh
diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml
new file mode 100644
index 0000000000000..e0bb24276a653
--- /dev/null
+++ b/.github/workflows/sphinx-lint.yml
@@ -0,0 +1,32 @@
+name: Lint documentation
+
+on:
+ push:
+ branches:
+ - main
+ paths:
+ - "docs/**"
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "docs/**"
+
+jobs:
+ sphinx-lint:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.12"]
+ steps:
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Linting docs
+ run: tools/sphinx-lint.sh
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 0000000000000..81e7c9b050760
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,52 @@
+name: 'Close inactive issues and PRs'
+
+on:
+ schedule:
+ # Daily at 1:30 AM UTC
+ - cron: '30 1 * * *'
+
+jobs:
+ close-issues-and-pull-requests:
+ permissions:
+ issues: write
+ pull-requests: write
+ actions: write
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+ with:
+ # Increasing this value ensures that changes to this workflow
+ # propagate to all issues and PRs in days rather than months
+ operations-per-run: 1000
+
+ exempt-draft-pr: true
+ exempt-issue-labels: 'keep-open'
+ exempt-pr-labels: 'keep-open'
+
+ labels-to-add-when-unstale: 'unstale'
+ labels-to-remove-when-stale: 'unstale'
+
+ days-before-issue-stale: 90
+ days-before-issue-close: 30
+ stale-issue-label: 'stale'
+ stale-issue-message: >
+ This issue has been automatically marked as stale because it has not
+ had any activity within 90 days. It will be automatically closed if no
+ further activity occurs within 30 days. Leave a comment if
+ you feel this issue should remain open. Thank you!
+ close-issue-message: >
+ This issue has been automatically closed due to inactivity. Please
+ feel free to reopen if you feel it is still relevant. Thank you!
+
+ days-before-pr-stale: 90
+ days-before-pr-close: 30
+ stale-pr-label: 'stale'
+ stale-pr-message: >
+ This pull request has been automatically marked as stale because it
+ has not had any activity within 90 days. It will be automatically
+ closed if no further activity occurs within 30 days. Leave a comment
+ if you feel this pull request should remain open. Thank you!
+ close-pr-message: >
+ This pull request has been automatically closed due to inactivity.
+ Please feel free to reopen if you intend to continue working on it.
+ Thank you!
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index eb728ae04dfc1..ff441f94435ad 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -6,26 +6,33 @@ on:
push:
branches:
- main
+ paths:
+ - "**/*.py"
+ - .github/workflows/yapf.yml
pull_request:
branches:
- main
+ paths:
+ - "**/*.py"
+ - .github/workflows/yapf.yml
+
jobs:
yapf:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.12"]
steps:
- - uses: actions/checkout@v4
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install yapf==0.32.0
- pip install toml==0.10.2
- - name: Running yapf
- run: |
- yapf --diff --recursive .
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install yapf==0.32.0
+ pip install toml==0.10.2
+ - name: Running yapf
+ run: |
+ yapf --diff --recursive .
diff --git a/.gitignore b/.gitignore
index 1ea6e3419db2a..ceef6a5fba456 100644
--- a/.gitignore
+++ b/.gitignore
@@ -202,3 +202,4 @@ benchmarks/*.json
# Linting
actionlint
+shellcheck*/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 42cbf18a0f712..284196bc2d279 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,17 +6,16 @@ version: 2
build:
os: ubuntu-22.04
tools:
- python: "3.8"
+ python: "3.12"
sphinx:
- configuration: docs/source/conf.py
- fail_on_warning: true
+ configuration: docs/source/conf.py
+ fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
# Optionally declare the Python requirements required to build your docs
python:
- install:
- - requirements: docs/requirements-docs.txt
-
+ install:
+ - requirements: docs/requirements-docs.txt
diff --git a/.shellcheckrc b/.shellcheckrc
new file mode 100644
index 0000000000000..f3b6eedf8d907
--- /dev/null
+++ b/.shellcheckrc
@@ -0,0 +1,9 @@
+# rules currently disabled:
+#
+# SC1091 (info): Not following: was not specified as input (see shellcheck -x)
+# SC2004 (style): $/${} is unnecessary on arithmetic variables.
+# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
+# SC2155 (warning): Declare and assign separately to avoid masking return values.
+# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
+#
+disable=SC1091,SC2004,SC2129,SC2155,SC2164
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f4648a37dbca..bfe435937e3bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,13 +31,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
# Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
#
# Supported/expected torch versions for CUDA/ROCm.
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
#
# Try to find python package with an executable that exactly matches
@@ -83,24 +83,6 @@ endif()
#
find_package(Torch REQUIRED)
-#
-message(STATUS "Enabling core extension.")
-
-# Define _core_C extension
-# built for (almost) every target platform, (excludes TPU and Neuron)
-
-set(VLLM_EXT_SRC
- "csrc/core/torch_bindings.cpp")
-
-define_gpu_extension_target(
- _core_C
- DESTINATION vllm
- LANGUAGE CXX
- SOURCES ${VLLM_EXT_SRC}
- COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
- USE_SABI 3
- WITH_SOABI)
-
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -146,9 +128,9 @@ endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
- # For cuda we want to be able to control which architectures we compile for on
+ # For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
- # the set of architectures we want to compile for and remove the from the
+ # the set of architectures we want to compile for and remove the from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
@@ -156,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported supported archs
# since for some files we will build for all CUDA_ARCHS.
- cuda_archs_loose_intersection(CUDA_ARCHS
+ cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
@@ -187,12 +169,12 @@ endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
-# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
+# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
+# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
+# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/.
#
include(FetchContent)
-get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
-set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
+file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
@@ -205,15 +187,16 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
set(VLLM_EXT_SRC
"csrc/cache_kernels.cu"
- "csrc/attention/attention_kernels.cu"
+ "csrc/attention/paged_attention_v1.cu"
+ "csrc/attention/paged_attention_v2.cu"
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
+ "csrc/layernorm_quant_kernels.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
- "csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
@@ -223,7 +206,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
- FetchContent_Declare(
+ # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
+ if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+ set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+ endif()
+
+ if(VLLM_CUTLASS_SRC_DIR)
+ if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
+ get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+ endif()
+ message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+ FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+ else()
+ FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1
@@ -233,7 +228,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
- )
+ )
+ endif()
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
@@ -255,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
- set(MARLIN_SRCS
+ set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@@ -270,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
- "in CUDA target architectures")
+ " in CUDA target architectures")
endif()
#
@@ -296,7 +292,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures")
endif()
- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+ # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()
@@ -304,7 +300,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
- cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+ cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@@ -335,10 +331,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
#
- # For the Machete kernels we automatically generate sources for various
+ # For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
- set(MACHETE_GEN_SCRIPT
+ set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
@@ -348,8 +344,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
- COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+ COMMAND ${CMAKE_COMMAND} -E env
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@@ -359,11 +355,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
- " Result: \"${machete_generation_result}\""
+ " Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
- set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+ set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif()
@@ -385,7 +381,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -411,8 +407,8 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)
-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -423,6 +419,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
+ "csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
@@ -450,7 +447,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
- "in CUDA target architectures")
+ " in CUDA target architectures")
endif()
endif()
@@ -489,9 +486,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis)
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
@@ -525,8 +522,10 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+ GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
GIT_PROGRESS TRUE
+ # Don't share the vllm-flash-attn build between build types
+ BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5f79356bd32f7..6d46a6dca371d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,50 +1,3 @@
# Contributing to vLLM
-Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
-
-- Identify and report any issues or bugs.
-- Request or add support for a new model.
-- Suggest or implement new features.
-- Improve documentation or contribute a how-to guide.
-
-We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
-
-Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
-
-
-## Developing
-
-Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
-
-
-## Testing
-
-```bash
-pip install -r requirements-dev.txt
-
-# linting and formatting
-bash format.sh
-# Static type checking
-mypy
-# Unit tests
-pytest tests/
-```
-**Note:** Currently, the repository does not pass the ``mypy`` tests.
-
-## Contribution Guidelines
-
-### Issues
-
-If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
-
-> [!IMPORTANT]
-> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).
-
-### Pull Requests & Code Reviews
-
-Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
-
-### Thank You
-
-Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
-All of your contributions help make vLLM a great tool and community for everyone!
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
diff --git a/DCO b/DCO
new file mode 100644
index 0000000000000..49b8cb0549267
--- /dev/null
+++ b/DCO
@@ -0,0 +1,34 @@
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
diff --git a/Dockerfile b/Dockerfile
index d527868bc4c2f..220dbe26712ec 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -70,8 +70,10 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
-# files and directories related to build wheels
COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -189,6 +191,14 @@ ADD . /vllm-workspace/
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
+# enable fast downloads from hf (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+ python3 -m pip install hf_transfer
+ENV HF_HUB_ENABLE_HF_TRANSFER 1
+
+# Copy in the v1 package for testing (it isn't distributed yet)
+COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
+
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
@@ -204,7 +214,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
ENV VLLM_USAGE_SOURCE production-docker-image
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index b9134d4ae41cb..287b4958da4e5 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
RUN echo 'ulimit -c 0' >> ~/.bashrc
-RUN pip install intel_extension_for_pytorch==2.4.0
+RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace
@@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \
pip install -r requirements-build.txt
-# install oneDNN
-RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
-
-RUN --mount=type=cache,target=/root/.cache/ccache \
- cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
- -DONEDNN_BUILD_DOC=OFF \
- -DONEDNN_BUILD_EXAMPLES=OFF \
- -DONEDNN_BUILD_TESTS=OFF \
- -DONEDNN_BUILD_GRAPH=OFF \
- -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
- -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
- cmake --build ./oneDNN/build --target install --config Release
-
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
@@ -55,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
-COPY ./ ./
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
diff --git a/Dockerfile.hpu b/Dockerfile.hpu
new file mode 100644
index 0000000000000..d18fc016387bf
--- /dev/null
+++ b/Dockerfile.hpu
@@ -0,0 +1,18 @@
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-hpu.txt
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index adae6db87ba87..2143315d2a078 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -17,7 +17,7 @@ RUN apt-get update && \
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
-WORKDIR ${APP_MOUNT}
+WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
@@ -25,17 +25,17 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-COPY . /app/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-RUN cd /app/vllm \
- && python3 -m pip install -U \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+RUN python3 -m pip install -U \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
- cd /app/vllm \
- && pip install --no-build-isolation -v -e . \
- && cd ..
+ pip install --no-build-isolation -v -e .
CMD ["/bin/bash"]
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index d65bfa08ccd90..a05ff452cd36e 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -10,13 +10,16 @@ RUN apt-get update -y && \
WORKDIR /workspace
COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
-COPY examples/ /workspace/vllm/examples
-COPY benchmarks/ /workspace/vllm/benchmarks
+COPY examples/ /workspace/examples
+COPY benchmarks/ /workspace/benchmarks
CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index 1f374b01b9bc0..b19c6ddec7948 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -14,11 +14,14 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
# These packages will be in rocketce eventually
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0
@@ -30,4 +33,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 496e6bed7c022..62d4a9b4909c3 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -51,9 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
- torch==2.6.0.dev20240918 \
- setuptools-scm>=8 \
- torchvision==0.20.0.dev20240918 \
+ torch==2.6.0.dev20241113+rocm6.2 \
+ 'setuptools-scm>=8' \
+ torchvision==0.20.0.dev20241113+rocm6.2 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac
@@ -117,6 +117,11 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN python3 -m pip install --upgrade pip
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index d8f1a42c45177..0a507b6ecdf60 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -1,29 +1,25 @@
-ARG NIGHTLY_DATE="20240828"
+ARG NIGHTLY_DATE="20241017"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
-WORKDIR /workspace
+WORKDIR /workspace/vllm
# Install some basic utilities
RUN apt-get update && apt-get install -y \
git \
ffmpeg libsm6 libxext6 libgl1
-# Install the TPU and Pallas dependencies.
-RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
# Build vLLM.
-COPY . /workspace/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
ENV VLLM_TARGET_DEVICE="tpu"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
- cd /workspace/vllm && \
python3 -m pip install \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-tpu.txt
-RUN cd /workspace/vllm && python3 setup.py develop
+RUN python3 setup.py develop
CMD ["/bin/bash"]
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 83db341556eaf..63bc682770422 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -30,10 +30,23 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
- --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt
-COPY ./ /workspace/vllm
+RUN git clone https://github.com/intel/pti-gpu && \
+ cd pti-gpu/sdk && \
+ git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+ mkdir build && \
+ cd build && \
+ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+ make -j && \
+ cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
+COPY . .
+ARG GIT_REPO_CHECK
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
ENV VLLM_TARGET_DEVICE=xpu
diff --git a/README.md b/README.md
index 72c3273edc61d..0ef073210d070 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,11 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |
+---
*Latest News* 🔥
-- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
@@ -42,7 +44,7 @@ vLLM is fast with:
- Speculative decoding
- Chunked prefill
-**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with:
@@ -98,6 +100,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Dropbox
- Google Cloud
- Lambda Lab
+- Nebius
- NVIDIA
- Replicate
- Roblox
@@ -127,5 +130,6 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
+* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 192d6c4022c83..2aa4a285021f1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -6,3 +6,14 @@ You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
+
+## Downloading the ShareGPT4V dataset
+
+The JSON file references several image datasets (coco, llava, etc.). The benchmark scripts
+will skip a datapoint if the referenced image is missing.
+```bash
+wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+mkdir -p coco
+wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
+unzip coco/train2017.zip -d coco/
+```
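Editorial note: a quick way to see how many ShareGPT4V datapoints will actually be used is to check which referenced images exist locally. This is a hedged sketch, not part of the repository; it assumes the JSON is a list of records with an optional `image` path, which is how the throughput benchmark below treats it.

```python
# Editorial sketch: count ShareGPT4V records whose referenced image exists
# locally; records with missing images are skipped by the benchmark scripts.
import json
import os

with open("sharegpt4v_instruct_gpt4-vision_cap100k.json") as f:
    data = json.load(f)

present = sum(1 for d in data if "image" in d and os.path.exists(d["image"]))
print(f"{present}/{len(data)} datapoints have their image available locally")
```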
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4813fde27f0bc..c3fed56e8a956 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -54,6 +54,7 @@ async def async_request_tgi(
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
+ "truncate": request_func_input.prompt_len,
# TGI does not accept ignore_eos flag.
}
payload = {
@@ -79,7 +80,7 @@ async def async_request_tgi(
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
- chunk = remove_prefix(chunk_bytes, "data:")
+ chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
@@ -144,8 +145,8 @@ async def async_request_trt_llm(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"),
- "data:")
+ chunk = chunk_bytes.decode("utf-8").removeprefix(
+ "data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
@@ -256,13 +257,14 @@ async def async_request_openai_completions(
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
+ first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"),
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix(
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -274,7 +276,8 @@ async def async_request_openai_completions(
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
- if ttft == 0.0:
+ if not first_chunk_received:
+ first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
@@ -285,9 +288,14 @@ async def async_request_openai_completions(
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
-
+ if first_chunk_received:
+ output.success = True
+ else:
+ output.success = False
+ output.error = (
+ "Never received a valid chunk to calculate TTFT."
+ "This response will be marked as failed!")
output.generated_text = generated_text
- output.success = True
output.latency = latency
else:
output.error = response.reason or ""
@@ -324,7 +332,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
- "max_tokens": request_func_input.output_len,
+ "max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
@@ -349,8 +357,8 @@ async def async_request_openai_chat_completions(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"),
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix(
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -389,14 +397,6 @@ async def async_request_openai_chat_completions(
return output
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
- if text.startswith(prefix):
- return text[len(prefix):]
- return text
-
-
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 79a48b2a1a845..0a14aedd5feba 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,5 +1,6 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
+import dataclasses
import json
import time
from pathlib import Path
@@ -10,44 +11,19 @@
from tqdm import tqdm
from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
+from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace):
print(args)
+ engine_args = EngineArgs.from_cli_args(args)
+
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
- llm = LLM(
- model=args.model,
- speculative_model=args.speculative_model,
- num_speculative_tokens=args.num_speculative_tokens,
- speculative_draft_tensor_parallel_size=\
- args.speculative_draft_tensor_parallel_size,
- tokenizer=args.tokenizer,
- quantization=args.quantization,
- tensor_parallel_size=args.tensor_parallel_size,
- trust_remote_code=args.trust_remote_code,
- dtype=args.dtype,
- max_model_len=args.max_model_len,
- enforce_eager=args.enforce_eager,
- kv_cache_dtype=args.kv_cache_dtype,
- quantization_param_path=args.quantization_param_path,
- device=args.device,
- ray_workers_use_nsight=args.ray_workers_use_nsight,
- use_v2_block_manager=args.use_v2_block_manager,
- enable_chunked_prefill=args.enable_chunked_prefill,
- download_dir=args.download_dir,
- block_size=args.block_size,
- gpu_memory_utilization=args.gpu_memory_utilization,
- load_format=args.load_format,
- distributed_executor_backend=args.distributed_executor_backend,
- otlp_traces_endpoint=args.otlp_traces_endpoint,
- enable_prefix_caching=args.enable_prefix_caching,
- )
+ llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(
n=args.n,
@@ -126,19 +102,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
- parser.add_argument('--model', type=str, default='facebook/opt-125m')
- parser.add_argument('--speculative-model', type=str, default=None)
- parser.add_argument('--num-speculative-tokens', type=int, default=None)
- parser.add_argument('--speculative-draft-tensor-parallel-size',
- '-spec-draft-tp',
- type=int,
- default=None)
- parser.add_argument('--tokenizer', type=str, default=None)
- parser.add_argument('--quantization',
- '-q',
- choices=[*QUANTIZATION_METHODS, None],
- default=None)
- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
@@ -155,45 +118,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
type=int,
default=30,
help='Number of iterations to run.')
- parser.add_argument('--trust-remote-code',
- action='store_true',
- help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
- parser.add_argument(
- '--dtype',
- type=str,
- default='auto',
- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
- parser.add_argument('--enforce-eager',
- action='store_true',
- help='enforce eager mode and disable CUDA graph')
- parser.add_argument(
- '--kv-cache-dtype',
- type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
- parser.add_argument(
- '--quantization-param-path',
- type=str,
- default=None,
- help='Path to the JSON file containing the KV cache scaling factors. '
- 'This should generally be supplied, when KV cache dtype is FP8. '
- 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
- 'instead supported for common inference criteria.')
parser.add_argument(
'--profile',
action='store_true',
@@ -204,81 +128,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
- parser.add_argument("--device",
- type=str,
- default="auto",
- choices=DEVICE_OPTIONS,
- help='device type for vLLM execution')
- parser.add_argument('--block-size',
- type=int,
- default=16,
- help='block size of key/value cache')
- parser.add_argument(
- '--enable-chunked-prefill',
- action='store_true',
- help='If True, the prefill requests can be chunked based on the '
- 'max_num_batched_tokens')
- parser.add_argument("--enable-prefix-caching",
- action='store_true',
- help="Enable automatic prefix caching")
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- default=EngineArgs.use_v2_block_manager)
- parser.add_argument(
- "--ray-workers-use-nsight",
- action='store_true',
- help="If specified, use nsight to profile ray workers",
- )
- parser.add_argument('--download-dir',
- type=str,
- default=None,
- help='directory to download and load the weights, '
- 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the latency results in JSON format.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument(
- '--load-format',
- type=str,
- default=EngineArgs.load_format,
- choices=[
- 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
- 'bitsandbytes'
- ],
- help='The format of the model weights to load.\n\n'
- '* "auto" will try to load the weights in the safetensors format '
- 'and fall back to the pytorch bin format if safetensors format '
- 'is not available.\n'
- '* "pt" will load the weights in the pytorch bin format.\n'
- '* "safetensors" will load the weights in the safetensors format.\n'
- '* "npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading.\n'
- '* "dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.\n'
- '* "tensorizer" will load the weights using tensorizer from '
- 'CoreWeave. See the Tensorize vLLM Model script in the Examples'
- 'section for more information.\n'
- '* "bitsandbytes" will load the weights using bitsandbytes '
- 'quantization.\n')
- parser.add_argument(
- '--distributed-executor-backend',
- choices=['ray', 'mp'],
- default=None,
- help='Backend to use for distributed serving. When more than 1 GPU '
- 'is used, will be automatically set to "ray" if installed '
- 'or "mp" (multiprocessing) otherwise.')
- parser.add_argument(
- '--otlp-traces-endpoint',
- type=str,
- default=None,
- help='Target URL to which OpenTelemetry traces will be sent.')
+
+ parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
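Editorial note: the refactor above replaces the long list of hand-maintained flags with `EngineArgs`: the parser is extended via `EngineArgs.add_cli_args`, and the parsed namespace is converted back into `LLM` keyword arguments. A minimal sketch of that pattern follows (the model name and `--input-len` flag are illustrative; constructing `LLM` loads the model).

```python
# Minimal sketch of the EngineArgs-driven CLI pattern used in the benchmarks.
import dataclasses

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="EngineArgs CLI sketch")
parser.add_argument("--input-len", type=int, default=32)  # benchmark-specific flag
parser = EngineArgs.add_cli_args(parser)  # adds --model, -tp, --dtype, ... in one call
args = parser.parse_args(["--model", "facebook/opt-125m", "--input-len", "64"])

engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))  # forwards every engine flag unchanged
```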
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index f14092d347343..5e9381f712e10 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -25,6 +25,7 @@
--input-length-range 128:256
"""
+import dataclasses
import json
import random
import time
@@ -53,13 +54,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}")
-def sample_requests(
+@dataclasses.dataclass
+class Request:
+ prompt: str
+ prompt_len: int
+ output_len: int
+
+
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> List[int]:
+ vocab = tokenizer.get_vocab()
+ # Remove the special tokens (compare token ids, not token strings).
+ vocab = {
+ k: v
+ for k, v in vocab.items() if v not in tokenizer.all_special_ids
+ }
+ return random.choices(list(vocab.values()), k=length)
+
+
+def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@@ -76,31 +94,55 @@ def sample_requests(
random.shuffle(dataset)
min_len, max_len = input_length_range
+ assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
- filtered_dataset: List[Tuple[str, int, int]] = []
+ filtered_requests: List[Request] = []
+
for i in range(len(dataset)):
- if len(filtered_dataset) == num_requests:
+ if len(filtered_requests) == num_requests:
break
# Tokenize the prompts and completions.
- prompt = dataset[i][0]
- prompt_token_ids = tokenizer(prompt).input_ids
+ prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+ prompt = tokenizer.decode(prompt_token_ids)
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
- output_len = len(completion_token_ids
- ) if fixed_output_len is None else fixed_output_len
- if prompt_len < 4 or output_len < 4:
- # Prune too short sequences.
- continue
+ output_len = (len(completion_token_ids)
+ if fixed_output_len is None else fixed_output_len)
if min_len <= prompt_len <= max_len:
- filtered_dataset.append((prompt, prompt_len, output_len))
+ filtered_requests.append(Request(prompt, prompt_len, output_len))
- return filtered_dataset
+ return filtered_requests
-def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
+def sample_requests_from_random(
+ num_requests: int,
+ tokenizer: PreTrainedTokenizerBase,
+ input_length_range: Tuple[int, int],
+ fixed_output_len: Optional[int],
+ prefix_len: int,
+) -> List[Request]:
+
+ requests = []
+ prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+ min_len, max_len = input_length_range
+
+ for i in range(num_requests):
+ unique_part_token_ids = sample_tokens(
+ tokenizer,
+ random.randint(min_len - prefix_len, max_len - prefix_len))
+ prompt_token_ids = prefix_token_ids + unique_part_token_ids
+ prompt = tokenizer.decode(prompt_token_ids)
+ prompt_len = len(prompt_token_ids)
+ assert (min_len <= prompt_len <= max_len
+ ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+ requests.append(Request(prompt, prompt_len, fixed_output_len))
+ return requests
+
+
+def repeat_and_sort_requests(requests: List[Request],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
@@ -108,7 +150,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
- repeated_requests.sort(key=lambda x: x[1])
+ repeated_requests.sort(key=lambda x: x.prompt_len)
else:
random.shuffle(repeated_requests)
- return [req[0] for req in repeated_requests]
+ return [req.prompt for req in repeated_requests]
def main(args):
@@ -116,9 +158,12 @@ def main(args):
input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed)
if args.dataset_path is not None:
- print(f"Start to sample {args.num_prompts} prompts"
- "from {args.dataset_path}")
- filtered_datasets = sample_requests(
+ if args.prefix_len > 0:
+ raise ValueError("prefix-len is not supported when "
+ "dataset-path is provided.")
+ print(f"Start to sample {args.num_prompts} prompts "
+ f"from {args.dataset_path}")
+ filtered_requests = sample_requests_from_dataset(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
@@ -126,32 +171,34 @@ def main(args):
fixed_output_len=args.output_len,
)
else:
- prompt_len = len(tokenizer(PROMPT).input_ids)
- filtered_datasets = [(PROMPT, prompt_len, args.output_len)
- ] * args.num_prompts
-
- llm = LLM(model=args.model,
- tokenizer_mode='auto',
- trust_remote_code=True,
- enforce_eager=True,
- use_v2_block_manager=args.use_v2_block_manager,
- tensor_parallel_size=args.tensor_parallel_size,
- enable_prefix_caching=args.enable_prefix_caching)
+ print(f"Start to sample {args.num_prompts} prompts from random")
+ filtered_requests = sample_requests_from_random(
+ num_requests=args.num_prompts,
+ tokenizer=tokenizer,
+ input_length_range=input_length_range,
+ fixed_output_len=args.output_len,
+ prefix_len=args.prefix_len,
+ )
+
+ # Print some helpful stats of the requests.
+ print(f"Sampled {len(filtered_requests)} requests.")
+ prompt_lens = [req.prompt_len for req in filtered_requests]
+ print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
+ print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
+ print(f"Min Prompt Length: {min(prompt_lens)}")
+ print(f"Max Prompt Length: {max(prompt_lens)}")
+
+ engine_args = EngineArgs.from_cli_args(args)
+
+ llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
- print("Testing filtered datasets")
- prompts = repeat_and_sort_requests(filtered_datasets,
+ print("Testing filtered requests")
+ prompts = repeat_and_sort_requests(filtered_requests,
repeat_count=args.repeat_count,
sort=args.sort)
- print("------warm up------")
- test_prefix(
- llm=llm,
- prompts=prompts,
- sampling_params=sampling_params,
- )
-
print("------start generating------")
test_prefix(
llm=llm,
@@ -164,41 +211,37 @@ def main(args):
parser = FlexibleArgumentParser(
description=
'Benchmark the performance with or without automatic prefix caching.')
- parser.add_argument('--model',
- type=str,
- default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset.")
- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
- parser.add_argument('--enable-prefix-caching',
- action='store_true',
- help='enable prefix caching')
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- default=EngineArgs.use_v2_block_manager,
- help='Use BlockSpaceMangerV2')
parser.add_argument('--num-prompts',
type=int,
- default=1,
+ required=True,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
- default=100,
+ default=1,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
- default='128:256',
+ required=True,
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
- parser.add_argument("--seed",
- type=int,
- default=0,
- help='Random seed for reproducibility')
+ parser.add_argument(
+ "--prefix-len",
+ type=int,
+ default=0,
+ help="Specifies the length of a common prefix to be "
+ "added to the input prompt. The input-length-range will "
+ "subtract this length when filtering prompts. Only used "
+ "when dataset-path is not provided.",
+ )
+
+ parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
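Editorial note: to make the role of `--prefix-len` concrete, every synthetic prompt is the same shared prefix followed by a unique random tail, so automatic prefix caching can hit on the shared part. The sketch below is a simplified stand-in for `sample_requests_from_random`: the helper name and plain integer "token ids" are illustrative, while the real script samples ids from the tokenizer vocabulary.

```python
# Toy illustration of the --prefix-len sampling scheme: a shared prefix of
# prefix_len tokens plus a per-request random tail, with total length kept
# inside [min_len, max_len].
import random

def make_synthetic_prompts(num_requests, prefix_len, min_len, max_len,
                           vocab_size=32000):
    prefix = [random.randrange(vocab_size) for _ in range(prefix_len)]
    prompts = []
    for _ in range(num_requests):
        tail_len = random.randint(min_len - prefix_len, max_len - prefix_len)
        tail = [random.randrange(vocab_size) for _ in range(tail_len)]
        prompts.append(prefix + tail)
    return prompts

prompts = make_synthetic_prompts(num_requests=4, prefix_len=8,
                                 min_len=16, max_len=24)
print([len(p) for p in prompts])  # all lengths fall within 16..24
```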
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index 8843e3a927a01..e0c9e6a6db502 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -1,5 +1,6 @@
"""Benchmark offline prioritization."""
import argparse
+import dataclasses
import json
import random
import time
@@ -7,7 +8,8 @@
from transformers import AutoTokenizer, PreTrainedTokenizerBase
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
def sample_requests(
@@ -62,46 +64,11 @@ def sample_requests(
def run_vllm(
requests: List[Tuple[str, int, int]],
- model: str,
- tokenizer: str,
- quantization: Optional[str],
- tensor_parallel_size: int,
- seed: int,
n: int,
- trust_remote_code: bool,
- dtype: str,
- max_model_len: Optional[int],
- enforce_eager: bool,
- kv_cache_dtype: str,
- quantization_param_path: Optional[str],
- device: str,
- enable_prefix_caching: bool,
- enable_chunked_prefill: bool,
- max_num_batched_tokens: int,
- gpu_memory_utilization: float = 0.9,
- download_dir: Optional[str] = None,
+ engine_args: EngineArgs,
) -> float:
from vllm import LLM, SamplingParams
- llm = LLM(
- model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- download_dir=download_dir,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- disable_log_stats=False,
- )
+ llm = LLM(**dataclasses.asdict(engine_args))
# Add the requests to the engine.
prompts = []
@@ -142,16 +109,8 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
- elapsed_time = run_vllm(requests, args.model, args.tokenizer,
- args.quantization, args.tensor_parallel_size,
- args.seed, args.n, args.trust_remote_code,
- args.dtype, args.max_model_len,
- args.enforce_eager, args.kv_cache_dtype,
- args.quantization_param_path, args.device,
- args.enable_prefix_caching,
- args.enable_chunked_prefill,
- args.max_num_batched_tokens,
- args.gpu_memory_utilization, args.download_dir)
+ elapsed_time = run_vllm(requests, args.n,
+ EngineArgs.from_cli_args(args))
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
@@ -173,7 +132,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+ parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
@@ -191,13 +150,6 @@ def main(args: argparse.Namespace):
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
- parser.add_argument("--model", type=str, default="facebook/opt-125m")
- parser.add_argument("--tokenizer", type=str, default=None)
- parser.add_argument('--quantization',
- '-q',
- choices=[*QUANTIZATION_METHODS, None],
- default=None)
- parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
@@ -206,81 +158,13 @@ def main(args: argparse.Namespace):
type=int,
default=200,
help="Number of prompts to process.")
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument('--trust-remote-code',
- action='store_true',
- help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
- parser.add_argument(
- '--dtype',
- type=str,
- default='auto',
- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument("--enforce-eager",
- action="store_true",
- help="enforce eager execution")
- parser.add_argument(
- '--kv-cache-dtype',
- type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
- parser.add_argument(
- '--quantization-param-path',
- type=str,
- default=None,
- help='Path to the JSON file containing the KV cache scaling factors. '
- 'This should generally be supplied, when KV cache dtype is FP8. '
- 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
- 'instead supported for common inference criteria.')
- parser.add_argument(
- "--device",
- type=str,
- default="cuda",
- choices=["cuda", "cpu"],
- help='device type for vLLM execution, supporting CUDA and CPU.')
- parser.add_argument(
- "--enable-prefix-caching",
- action='store_true',
- help="enable automatic prefix caching for vLLM backend.")
- parser.add_argument("--enable-chunked-prefill",
- action='store_true',
- help="enable chunked prefill for vLLM backend.")
- parser.add_argument('--max-num-batched-tokens',
- type=int,
- default=None,
- help='maximum number of batched tokens per '
- 'iteration')
- parser.add_argument('--download-dir',
- type=str,
- default=None,
- help='directory to download and load the weights, '
- 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
+ parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c1a396c81f666..e9fc037a46965 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -53,6 +53,8 @@
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
@dataclass
class BenchmarkMetrics:
@@ -60,6 +62,7 @@ class BenchmarkMetrics:
total_input: int
total_output: int
request_throughput: float
+ request_goodput: float
output_throughput: float
total_token_throughput: float
mean_ttft_ms: float
@@ -202,6 +205,7 @@ def sample_hf_requests(
dataset_split: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
+ random_seed: int,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
dataset = load_dataset(dataset_path,
@@ -210,8 +214,8 @@ def sample_hf_requests(
streaming=True)
assert "conversations" in dataset.features, (
"HF Dataset must have 'conversations' column.")
- filtered_dataset = dataset.shuffle().filter(
- lambda x: len(x["conversations"]) >= 2)
+ filter_func = lambda x: len(x["conversations"]) >= 2
+ filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in filtered_dataset:
@@ -247,6 +251,19 @@ def sample_hf_requests(
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
+ elif "image" in data and isinstance(data["image"], str):
+ if (data["image"].startswith("http://") or \
+ data["image"].startswith("file://")):
+ image_url = data["image"]
+ else:
+ image_url = f"file://{data['image']}"
+
+ mm_content = {
+ "type": "image_url",
+ "image_url": {
+ "url": image_url
+ },
+ }
else:
mm_content = None
@@ -293,8 +310,33 @@ def sample_random_requests(
async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
+ burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[str, int, int], None]:
+ """
+ Asynchronously generates requests at a specified rate
+ with OPTIONAL burstiness.
+
+ Args:
+ input_requests:
+ A list of input requests, each represented as a tuple.
+ request_rate:
+ The rate at which requests are generated (requests/s).
+ burstiness (optional):
+ The burstiness factor of the request generation.
+ Only takes effect when request_rate is not inf.
+ Default value is 1, which follows a Poisson process.
+ Otherwise, the request intervals follow a gamma distribution.
+ A lower burstiness value (0 < burstiness < 1) results
+ in more bursty requests, while a higher burstiness value
+ (burstiness > 1) results in a more uniform arrival of requests.
+ """
input_requests = iter(input_requests)
+
+ # Calculate scale parameter theta to maintain the desired request_rate.
+ assert burstiness > 0, (
+ f"A positive burstiness factor is expected, but given {burstiness}.")
+ theta = 1.0 / (request_rate * burstiness)
+
for request in input_requests:
yield request
@@ -302,8 +344,9 @@ async def get_request(
# If the request rate is infinity, then we don't need to wait.
continue
- # Sample the request interval from the exponential distribution.
- interval = np.random.exponential(1.0 / request_rate)
+ # Sample the request interval from the gamma distribution.
+ # If burstiness is 1, it follows exponential distribution.
+ interval = np.random.gamma(shape=burstiness, scale=theta)
# The next request will be sent after the interval.
await asyncio.sleep(interval)
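Editorial note: as a quick sanity check on the burstiness model introduced above (illustrative request rate), with `shape=burstiness` and `scale=1/(request_rate * burstiness)` the mean inter-arrival time stays `1/request_rate` for any burstiness, while lower burstiness values produce a higher-variance, burstier arrival pattern.

```python
# Editorial sanity check of gamma-distributed inter-arrival times:
# mean stays 1/request_rate for any burstiness; variance shrinks as
# burstiness grows (burstiness == 1 reduces to the Poisson case).
import numpy as np

request_rate = 10.0  # requests/s, illustrative
for burstiness in (0.5, 1.0, 2.0):
    theta = 1.0 / (request_rate * burstiness)
    intervals = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
    print(f"burstiness={burstiness}: mean={intervals.mean():.3f}s "
          f"std={intervals.std():.3f}s")
```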
@@ -315,12 +358,15 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
+ gootput_config_dict: Dict[str, float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
completed = 0
+ good_completed = 0
itls: List[float] = []
tpots: List[float] = []
+ all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
@@ -334,9 +380,13 @@ def calculate_metrics(
add_special_tokens=False).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
+ tpot = 0
if output_len > 1:
- tpots.append(
- (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+ 1)
+ tpots.append(tpot)
+ # Note: if output_len <= 1, we regard tpot as 0 for goodput
+ all_tpots.append(tpot)
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
@@ -344,6 +394,28 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
+ if gootput_config_dict:
+ valid_metrics = []
+ slo_values = []
+
+ if "ttft" in gootput_config_dict:
+ valid_metrics.append(ttfts)
+ slo_values.append(gootput_config_dict["ttft"] /
+ MILLISECONDS_TO_SECONDS_CONVERSION)
+ if "tpot" in gootput_config_dict:
+ valid_metrics.append(all_tpots)
+ slo_values.append(gootput_config_dict["tpot"] /
+ MILLISECONDS_TO_SECONDS_CONVERSION)
+ if "e2el" in gootput_config_dict:
+ valid_metrics.append(e2els)
+ slo_values.append(gootput_config_dict["e2el"] /
+ MILLISECONDS_TO_SECONDS_CONVERSION)
+
+ for req_metric in zip(*valid_metrics):
+ is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+ if is_good_req:
+ good_completed += 1
+
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
@@ -354,6 +426,7 @@ def calculate_metrics(
total_input=total_input,
total_output=sum(actual_output_lens),
request_throughput=completed / dur_s,
+ request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
@@ -372,9 +445,9 @@ def calculate_metrics(
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
- mean_e2el_ms=np.median(e2els or 0) * 1000,
+ mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.mean(e2els or 0) * 1000,
+ median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
)
@@ -392,11 +465,14 @@ async def benchmark(
logprobs: Optional[int],
best_of: int,
request_rate: float,
+ burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
+ gootput_config_dict: Dict[str, float],
+ max_concurrency: Optional[int],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -444,13 +520,35 @@ async def benchmark(
if profile_output.success:
print("Profiler started")
+ if burstiness == 1.0:
+ distribution = "Poisson process"
+ else:
+ distribution = "Gamma distribution"
+
print(f"Traffic request rate: {request_rate}")
+ print(f"Burstiness factor: {burstiness} ({distribution})")
+ print(f"Maximum request concurrency: {max_concurrency}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+ # This can be used once the minimum Python version is 3.10 or higher,
+ # and it will simplify the code in limited_request_func.
+ # semaphore = (asyncio.Semaphore(max_concurrency)
+ # if max_concurrency else contextlib.nullcontext())
+ semaphore = (asyncio.Semaphore(max_concurrency)
+ if max_concurrency else None)
+
+ async def limited_request_func(request_func_input, pbar):
+ if semaphore is None:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+ async with semaphore:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
- async for request in get_request(input_requests, request_rate):
+ async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
prompt=prompt,
@@ -463,8 +561,8 @@ async def benchmark(
ignore_eos=ignore_eos)
tasks.append(
asyncio.create_task(
- request_func(request_func_input=request_func_input,
- pbar=pbar)))
+ limited_request_func(request_func_input=request_func_input,
+ pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
@@ -494,6 +592,7 @@ async def benchmark(
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
+ gootput_config_dict=gootput_config_dict,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -505,6 +604,9 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
+ if gootput_config_dict:
+ print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+ metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@@ -516,6 +618,8 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
+ "request_goodput:":
+ metrics.request_goodput if gootput_config_dict else None,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -569,6 +673,41 @@ def process_one_metric(
return result
+def check_goodput_args(args):
+ # Check and parse goodput arguments
+ gootput_config_dict = {}
+ VALID_NAMES = ["ttft", "tpot", "e2el"]
+ if args.goodput:
+ gootput_config_dict = parse_goodput(args.goodput)
+ for slo_name, slo_val in gootput_config_dict.items():
+ if slo_name not in VALID_NAMES:
+ raise ValueError(
+ f"Invalid metric name found, {slo_name}: {slo_val}. "
+ "The service level objective name should be one of "
+ f"{str(VALID_NAMES)}. ")
+ if slo_val < 0:
+ raise ValueError(
+ f"Invalid value found, {slo_name}: {slo_val}. "
+ "The service level objective value should be "
+ "non-negative.")
+ return gootput_config_dict
+
+
+def parse_goodput(slo_pairs):
+ gootput_config_dict = {}
+ try:
+ for slo_pair in slo_pairs:
+ slo_name, slo_val = slo_pair.split(":")
+ gootput_config_dict[slo_name] = float(slo_val)
+ except ValueError as err:
+ raise argparse.ArgumentTypeError(
+ "Invalid format found for service level objectives. "
+ "Specify service level objectives for goodput as \"KEY:VALUE\" "
+ "pairs, where the key is a metric name, and the value is a "
+ "number in milliseconds.") from err
+ return gootput_config_dict
+
+
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
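Editorial note: to make the goodput accounting above concrete (hedged sketch, illustrative numbers), each `KEY:VALUE` pair from `--goodput` becomes a millisecond threshold, and a request counts toward goodput only when every selected metric meets its threshold, which is what `calculate_metrics` checks via `s >= r` on the second-converted values.

```python
# Editorial sketch mirroring parse_goodput/calculate_metrics:
# thresholds are given in milliseconds, per-request metrics in seconds.
slo_pairs = ["ttft:300", "e2el:2000"]  # e.g. --goodput ttft:300 e2el:2000
slos = {name: float(val) for name, val in (p.split(":") for p in slo_pairs)}

requests = [  # illustrative per-request measurements
    {"ttft": 0.12, "e2el": 1.5},  # meets both SLOs   -> counts toward goodput
    {"ttft": 0.45, "e2el": 1.2},  # misses the TTFT SLO -> does not count
]
good = sum(
    all(req[name] <= slos[name] / 1000 for name in slos) for req in requests)
print(f"goodput-eligible requests: {good}/{len(requests)}")  # -> 1/2
```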
@@ -646,6 +785,7 @@ def main(args: argparse.Namespace):
dataset_split=args.hf_split,
num_requests=args.num_prompts,
tokenizer=tokenizer,
+ random_seed=args.seed,
fixed_output_len=args.hf_output_len,
)
@@ -662,6 +802,8 @@ def main(args: argparse.Namespace):
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
+ gootput_config_dict = check_goodput_args(args)
+
benchmark_result = asyncio.run(
benchmark(
backend=backend,
@@ -673,6 +815,7 @@ def main(args: argparse.Namespace):
logprobs=args.logprobs,
best_of=args.best_of,
request_rate=args.request_rate,
+ burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
@@ -680,6 +823,8 @@ def main(args: argparse.Namespace):
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos,
+ gootput_config_dict=gootput_config_dict,
+ max_concurrency=args.max_concurrency,
))
# Save config and results to json
@@ -709,13 +854,17 @@ def main(args: argparse.Namespace):
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
+ result_json["burstiness"] = args.burstiness
+ result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
# Save to file
base_model_id = model_id.split("/")[-1]
- file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
+ max_concurrency_str = (f"-concurrency{args.max_concurrency}"
+ if args.max_concurrency is not None else "")
+ file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
@@ -766,6 +915,19 @@ def main(args: argparse.Namespace):
default=None,
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.")
+ parser.add_argument(
+ "--max-concurrency",
+ type=int,
+ default=None,
+ help="Maximum number of concurrent requests. This can be used "
+ "to help simulate an environment where a higher level component "
+ "is enforcing a maximum number of concurrent requests. While the "
+ "--request-rate argument controls the rate at which requests are "
+ "initiated, this argument will control how many are actually allowed "
+ "to execute at a time. This means that when used in combination, the "
+ "actual request rate may be lower than specified with --request-rate, "
+ "if the server is not processing requests fast enough to keep up.")
+
parser.add_argument(
"--model",
type=str,
@@ -808,8 +970,20 @@ def main(args: argparse.Namespace):
default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process to synthesize "
- "the request arrival times.",
+ "Otherwise, we use Poisson process or gamma distribution "
+ "to synthesize the request arrival times.",
+ )
+ parser.add_argument(
+ "--burstiness",
+ type=float,
+ default=1.0,
+ help="Burstiness factor of the request generation. "
+ "Only take effect when request_rate is not inf. "
+ "Default value is 1, which follows Poisson process. "
+ "Otherwise, the request intervals follow a gamma distribution. "
+ "A lower burstiness value (0 < burstiness < 1) results in more "
+ "bursty requests. A higher burstiness value (burstiness > 1) "
+ "results in a more uniform arrival of requests.",
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument(
@@ -879,6 +1053,17 @@ def main(args: argparse.Namespace):
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
+ parser.add_argument(
+ "--goodput",
+ nargs="+",
+ required=False,
+ help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+ "pairs, where the key is a metric name, and the value is in "
+ "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+ "separated by spaces. Allowed request level metric names are "
+ "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+ "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+ "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
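Editorial note: the effect of `--max-concurrency` described above is easy to see in isolation. This is a hedged sketch, not the benchmark code itself: requests are still created up front, but at most `max_concurrency` of them run at once, mirroring the `limited_request_func` wrapper with a dummy request coroutine.

```python
# Editorial sketch of semaphore-bounded concurrency.
import asyncio
import time

async def fake_request(i: int) -> None:
    await asyncio.sleep(0.5)  # stand-in for one HTTP request

async def main(max_concurrency: int = 2, num_requests: int = 6) -> None:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def limited(i: int) -> None:
        async with semaphore:  # at most max_concurrency requests in flight
            await fake_request(i)

    start = time.perf_counter()
    await asyncio.gather(*(limited(i) for i in range(num_requests)))
    print(f"elapsed: {time.perf_counter() - start:.2f}s")  # ~1.5s, not ~0.5s

asyncio.run(main())
```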
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index b7bc2a6402375..159cf055737ce 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,30 +1,71 @@
"""Benchmark offline inference throughput."""
import argparse
+import dataclasses
import json
import random
import time
-from typing import List, Optional, Tuple
+from typing import List, Optional
import torch
import uvloop
+from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
-from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.inputs import TextPrompt
+from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-def sample_requests(
- dataset_path: str,
- num_requests: int,
- tokenizer: PreTrainedTokenizerBase,
- fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+ Attributes:
+ prompt: The input text prompt for the model.
+ multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+ images).
+ prompt_len: The length of the prompt in tokens.
+ expected_output_len: The expected length of the output in tokens.
+ """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ multi_modal_data: Optional[MultiModalDataDict] = None
+
+
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+ """Prepend and append special tokens around the question to form a prompt.
+
+ Args:
+ question: The input question text to wrap with special tokens
+ model: The name of the model being used, to determine which special
+ tokens to add
+
+ Returns:
+ The formatted prompt string with appropriate special tokens for the
+ model
+
+ Raises:
+ ValueError: If an unsupported model name is provided
+ """
+ model = model.lower()
+ if "pixtral" in model:
+ return f"[INST]{question}\n[IMG][/INST]"
+ raise ValueError(f"Unsupported model {model}")
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace) -> List[SampleRequest]:
+ dataset_path: str = args.dataset
+ num_requests: int = args.num_prompts
+ fixed_output_len: Optional[int] = args.output_len
+ model: str = args.model
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
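Editorial note: for orientation, here is how the `SampleRequest` dataclass and the pixtral prompt helper defined above fit together. This is a hedged sketch: the image path and token counts are placeholders, and the names assume the definitions above are in scope.

```python
# Editorial sketch: one text-only request and one image request built from
# the SampleRequest dataclass and _get_prompt_for_image_model helper above.
from PIL import Image

text_request = SampleRequest(prompt="Summarize the plot of Hamlet.",
                             prompt_len=9,            # placeholder token count
                             expected_output_len=128)

image = Image.open("coco/train2017/000000000139.jpg").convert("RGB")  # placeholder path
image_request = SampleRequest(
    prompt=_get_prompt_for_image_model("What is in this image?",
                                       model="pixtral"),  # hits the "pixtral" branch
    prompt_len=16,                                         # placeholder token count
    expected_output_len=128,
    multi_modal_data={"image": image},
)
```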
@@ -33,23 +74,36 @@ def sample_requests(
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
- # Only keep the first two turns of each conversation.
- dataset = [(data["conversations"][0]["value"],
- data["conversations"][1]["value"]) for data in dataset]
-
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
- filtered_dataset: List[Tuple[str, int, int]] = []
- for i in range(len(dataset)):
+ filtered_dataset: List[SampleRequest] = []
+ for data in dataset:
if len(filtered_dataset) == num_requests:
break
+ # Only keep the first two turns of each conversation.
+ prompt = data["conversations"][0]["value"]
+ completion = data["conversations"][1]["value"]
+
+ multi_modal_data: Optional[MultiModalDataDict] = None
+ if "image" in data:
+ multi_modal_data = multi_modal_data or {}
+ image_path = data["image"]
+ # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+ assert isinstance(image_path,
+ str), "Only support single image input"
+ try:
+ multi_modal_data["image"] = Image.open(image_path).convert(
+ "RGB")
+ except FileNotFoundError:
+ # Ignore datapoint where asset is missing
+ continue
+ prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
# Tokenize the prompts and completions.
- prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
- completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
@@ -60,75 +114,37 @@ def sample_requests(
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
- filtered_dataset.append((prompt, prompt_len, output_len))
+ filtered_dataset.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=prompt_len,
+ expected_output_len=output_len,
+ multi_modal_data=multi_modal_data))
return filtered_dataset
def run_vllm(
- requests: List[Tuple[str, int, int]],
- model: str,
- tokenizer: str,
- quantization: Optional[str],
- tensor_parallel_size: int,
- seed: int,
+ requests: List[SampleRequest],
n: int,
- trust_remote_code: bool,
- dtype: str,
- max_model_len: Optional[int],
- enforce_eager: bool,
- kv_cache_dtype: str,
- quantization_param_path: Optional[str],
- device: str,
- enable_prefix_caching: bool,
- enable_chunked_prefill: bool,
- max_num_batched_tokens: int,
- distributed_executor_backend: Optional[str],
- gpu_memory_utilization: float = 0.9,
- num_scheduler_steps: int = 1,
- use_v2_block_manager: bool = False,
- download_dir: Optional[str] = None,
- load_format: str = EngineArgs.load_format,
- disable_async_output_proc: bool = False,
+ engine_args: EngineArgs,
) -> float:
from vllm import LLM, SamplingParams
- llm = LLM(
- model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- download_dir=download_dir,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- distributed_executor_backend=distributed_executor_backend,
- load_format=load_format,
- num_scheduler_steps=num_scheduler_steps,
- use_v2_block_manager=use_v2_block_manager,
- disable_async_output_proc=disable_async_output_proc,
- )
+ llm = LLM(**dataclasses.asdict(engine_args))
# Add the requests to the engine.
- prompts: List[str] = []
+ prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
- for prompt, _, output_len in requests:
- prompts.append(prompt)
+ for request in requests:
+ prompts.append(
+ TextPrompt(prompt=request.prompt,
+ multi_modal_data=request.multi_modal_data))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
- max_tokens=output_len,
+ max_tokens=request.expected_output_len,
))
use_beam_search = False
@@ -138,11 +154,11 @@ def run_vllm(
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
- prompts = [prompt for prompt, _, _ in requests]
+ prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
- for prompt, input_len, _output_len in requests:
- assert _output_len == output_len
+ for request in requests:
+ assert request.expected_output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
@@ -156,75 +172,30 @@ def run_vllm(
async def run_vllm_async(
- requests: List[Tuple[str, int, int]],
- model: str,
- tokenizer: str,
- quantization: Optional[str],
- tensor_parallel_size: int,
- seed: int,
+ requests: List[SampleRequest],
n: int,
- trust_remote_code: bool,
- dtype: str,
- max_model_len: Optional[int],
- enforce_eager: bool,
- kv_cache_dtype: str,
- quantization_param_path: Optional[str],
- device: str,
- enable_prefix_caching: bool,
- enable_chunked_prefill: bool,
- max_num_batched_tokens: int,
- distributed_executor_backend: Optional[str],
- gpu_memory_utilization: float = 0.9,
- num_scheduler_steps: int = 1,
- use_v2_block_manager: bool = False,
- download_dir: Optional[str] = None,
- load_format: str = EngineArgs.load_format,
- disable_async_output_proc: bool = False,
+ engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
) -> float:
from vllm import SamplingParams
- engine_args = AsyncEngineArgs(
- model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- download_dir=download_dir,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- distributed_executor_backend=distributed_executor_backend,
- load_format=load_format,
- num_scheduler_steps=num_scheduler_steps,
- use_v2_block_manager=use_v2_block_manager,
- disable_async_output_proc=disable_async_output_proc,
- worker_use_ray=False,
- disable_log_requests=True,
- )
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
- prompts: List[str] = []
+ prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
- for prompt, _, output_len in requests:
- prompts.append(prompt)
+ for request in requests:
+ prompts.append(
+ TextPrompt(prompt=request.prompt,
+ multi_modal_data=request.multi_modal_data))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
- max_tokens=output_len,
+ max_tokens=request.expected_output_len,
))
generators = []
@@ -240,7 +211,7 @@ async def run_vllm_async(
def run_hf(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
@@ -298,14 +269,14 @@ def run_hf(
def run_mii(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [prompt for prompt, _, _ in requests]
+ prompts = [request.prompt for request in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
@@ -324,32 +295,39 @@ def main(args: argparse.Namespace):
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
- prompt = "hi" * (args.input_len - 1)
- requests = [(prompt, args.input_len, args.output_len)
- for _ in range(args.num_prompts)]
+ # As the tokenizer may add additional tokens like BOS, we need to try
+ # different lengths to get the desired input length.
+ for i in range(-10, 10):
+ prompt = "hi " * (args.input_len + i)
+ tokenized_prompt = tokenizer(prompt).input_ids
+ if len(tokenized_prompt) == args.input_len:
+ break
+ else:
+ raise ValueError(
+ f"Failed to synthesize a prompt with {args.input_len} tokens.")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=args.input_len,
+ expected_output_len=args.output_len)
+ for _ in range(args.num_prompts)
+ ]
else:
- requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
- args.output_len)
+ requests = sample_requests(tokenizer, args)
+ is_multi_modal = any(request.multi_modal_data is not None
+ for request in requests)
if args.backend == "vllm":
- run_args = [
- requests, args.model, args.tokenizer, args.quantization,
- args.tensor_parallel_size, args.seed, args.n,
- args.trust_remote_code, args.dtype, args.max_model_len,
- args.enforce_eager, args.kv_cache_dtype,
- args.quantization_param_path, args.device,
- args.enable_prefix_caching, args.enable_chunked_prefill,
- args.max_num_batched_tokens, args.distributed_executor_backend,
- args.gpu_memory_utilization, args.num_scheduler_steps,
- args.use_v2_block_manager, args.download_dir, args.load_format,
- args.disable_async_output_proc
- ]
-
if args.async_engine:
- run_args.append(args.disable_frontend_multiprocessing)
- elapsed_time = uvloop.run(run_vllm_async(*run_args))
+ elapsed_time = uvloop.run(
+ run_vllm_async(
+ requests,
+ args.n,
+ AsyncEngineArgs.from_cli_args(args),
+ args.disable_frontend_multiprocessing,
+ ))
else:
- elapsed_time = run_vllm(*run_args)
+ elapsed_time = run_vllm(requests, args.n,
+ EngineArgs.from_cli_args(args))
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -359,10 +337,18 @@ def main(args: argparse.Namespace):
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
- total_num_tokens = sum(prompt_len + output_len
- for _, prompt_len, output_len in requests)
+ total_num_tokens = sum(request.prompt_len + request.expected_output_len
+ for request in requests)
+ total_output_tokens = sum(request.expected_output_len
+ for request in requests)
+ if is_multi_modal:
+ print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+ "following metrics are not accurate because image tokens are not"
+ " counted. See vllm-project/vllm/issues/9778 for details.")
+ # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
- f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+ f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
# Output JSON results if specified
if args.output_json:
@@ -386,7 +372,9 @@ def main(args: argparse.Namespace):
parser.add_argument("--dataset",
type=str,
default=None,
- help="Path to the dataset.")
+ help="Path to the dataset. The dataset is expected to "
+ "be a json in form of List[Dict[..., conversations: "
+ "List[Dict[..., value: ]]]]")
parser.add_argument("--input-len",
type=int,
default=None,
@@ -396,13 +384,6 @@ def main(args: argparse.Namespace):
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
- parser.add_argument("--model", type=str, default="facebook/opt-125m")
- parser.add_argument("--tokenizer", type=str, default=None)
- parser.add_argument('--quantization',
- '-q',
- choices=[*QUANTIZATION_METHODS, None],
- default=None)
- parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
@@ -411,127 +392,15 @@ def main(args: argparse.Namespace):
type=int,
default=1000,
help="Number of prompts to process.")
- parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
- parser.add_argument('--trust-remote-code',
- action='store_true',
- help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
- parser.add_argument(
- '--dtype',
- type=str,
- default='auto',
- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument("--enforce-eager",
- action="store_true",
- help="enforce eager execution")
- parser.add_argument(
- '--kv-cache-dtype',
- type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
- parser.add_argument(
- '--quantization-param-path',
- type=str,
- default=None,
- help='Path to the JSON file containing the KV cache scaling factors. '
- 'This should generally be supplied, when KV cache dtype is FP8. '
- 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
- 'instead supported for common inference criteria.')
- parser.add_argument("--device",
- type=str,
- default="auto",
- choices=DEVICE_OPTIONS,
- help='device type for vLLM execution')
- parser.add_argument(
- "--num-scheduler-steps",
- type=int,
- default=1,
- help="Maximum number of forward steps per scheduler call.")
- parser.add_argument("--use-v2-block-manager",
- action='store_true',
- default=EngineArgs.use_v2_block_manager,
- help="Enable block manager v2.")
- parser.add_argument(
- "--enable-prefix-caching",
- action='store_true',
- help="Enable automatic prefix caching for vLLM backend.")
- parser.add_argument("--enable-chunked-prefill",
- action='store_true',
- help="enable chunked prefill for vLLM backend.")
- parser.add_argument('--max-num-batched-tokens',
- type=int,
- default=None,
- help='maximum number of batched tokens per '
- 'iteration')
- parser.add_argument('--download-dir',
- type=str,
- default=None,
- help='directory to download and load the weights, '
- 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
- parser.add_argument(
- '--distributed-executor-backend',
- choices=['ray', 'mp'],
- default=None,
- help='Backend to use for distributed serving. When more than 1 GPU '
- 'is used, will be automatically set to "ray" if installed '
- 'or "mp" (multiprocessing) otherwise.')
- parser.add_argument(
- '--load-format',
- type=str,
- default=EngineArgs.load_format,
- choices=[
- 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
- 'bitsandbytes'
- ],
- help='The format of the model weights to load.\n\n'
- '* "auto" will try to load the weights in the safetensors format '
- 'and fall back to the pytorch bin format if safetensors format '
- 'is not available.\n'
- '* "pt" will load the weights in the pytorch bin format.\n'
- '* "safetensors" will load the weights in the safetensors format.\n'
- '* "npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading.\n'
- '* "dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.\n'
- '* "tensorizer" will load the weights using tensorizer from '
- 'CoreWeave. See the Tensorize vLLM Model script in the Examples'
- 'section for more information.\n'
- '* "bitsandbytes" will load the weights using bitsandbytes '
- 'quantization.\n')
- parser.add_argument(
- "--disable-async-output-proc",
- action='store_true',
- default=False,
- help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine",
action='store_true',
default=False,
@@ -540,6 +409,7 @@ def main(args: argparse.Namespace):
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
+ parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
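The benchmark_throughput.py hunks above replace the hand-copied engine keyword list with `EngineArgs.from_cli_args(args)` and a `SampleRequest` dataclass. The following is a minimal sketch of the resulting flow, not part of the patch; the `SampleRequest` fields are copied from the hunks above and `EngineArgs.add_cli_args` / `from_cli_args` are assumed to behave as they do in vLLM's arg_utils.

```python
# Sketch only: mirrors the refactored benchmark flow under the assumptions above.
import argparse
import dataclasses
from typing import List, Optional

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs


@dataclasses.dataclass
class SampleRequest:
    prompt: str
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[dict] = None


def run(requests: List[SampleRequest], cli_args: List[str]) -> None:
    parser = argparse.ArgumentParser()
    parser = EngineArgs.add_cli_args(parser)      # engine flags come from vLLM itself
    args = parser.parse_args(cli_args)
    engine_args = EngineArgs.from_cli_args(args)  # replaces the long positional list
    llm = LLM(**dataclasses.asdict(engine_args))
    params = [SamplingParams(max_tokens=r.expected_output_len, ignore_eos=True)
              for r in requests]
    llm.generate([r.prompt for r in requests], params)
```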
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 92f6053cc6d7e..7acea6087fdfd 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -3,8 +3,8 @@
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
@@ -16,7 +16,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype)
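The same seeding change is applied to every kernel benchmark in this patch: the helper moves from `vllm.utils.seed_everything` onto the platform object. A one-function sketch of the new call site, assuming `current_platform` exposes `seed_everything` as shown above:

```python
# Minimal sketch of the new seeding call used across these benchmarks.
from vllm.platforms import current_platform


def setup_benchmark(seed: int = 0) -> None:
    # Seeds Python, NumPy and torch (and the accelerator, when present) in one call.
    current_platform.seed_everything(seed)
```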
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index b70c4b94c97a1..46bab74ae8adf 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -2,8 +2,10 @@
import copy
import itertools
import math
+import os
import pickle as pkl
import time
+from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional, Tuple
@@ -15,11 +17,12 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales,
+ marlin_zero_points)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
- gptq_pack, pack_rows, quantize_weights)
+ pack_rows, quantize_weights)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
@@ -27,149 +30,350 @@
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
DEFAULT_TP_SIZES = [1]
+NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False)
+
+if NVTX_PROFILE:
+ import nvtx
+
+
+def terse_type_name(dt):
+ return {
+ torch.bfloat16: "bf16",
+ torch.float16: "fp16",
+ torch.int8: "int8",
+ torch.float8_e4m3fn: "fp8",
+ torch.float: "float",
+ torch.int: "int",
+ }[dt]
+
+
+@dataclass
+class BenchmarkTensors:
+ w_ref: torch.Tensor
+ a: torch.Tensor
+
+ w_q: torch.Tensor
+ group_size: Optional[int]
+ wtype: ScalarType
+ w_g_s: torch.Tensor
+ w_g_zp: Optional[torch.Tensor]
+ w_ch_s: Optional[torch.Tensor]
+ w_tok_s: Optional[torch.Tensor]
+
+
+@dataclass
+class TypeConfig:
+ act_type: torch.dtype
+ weight_type: ScalarType
+ output_type: Optional[torch.dtype]
+ group_scale_type: Optional[torch.dtype]
+ group_zero_type: Optional[torch.dtype]
+ channel_scale_type: Optional[torch.dtype]
+ token_scale_type: Optional[torch.dtype]
+
+
+def rand_data(shape, dtype=torch.float16, scale=1):
+ if dtype.is_floating_point:
+ return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype)
+ else:
+ return torch.randint(-15, 15, shape, dtype=dtype, device="cuda")
+
+
+def quantize_and_pack(atype: torch.dtype,
+ w: torch.Tensor,
+ wtype: ScalarType,
+ stype: Optional[torch.dtype],
+ group_size: Optional[int],
+ zero_points: bool = False):
+ assert wtype.is_integer(), "TODO: support floating point weights"
+
+ w_ref, w_q, w_s, w_zp = quantize_weights(
+ w,
+ wtype,
+ group_size=group_size,
+ zero_points=zero_points,
+ # to match how the kernel applies zps
+ ref_zero_points_after_scales=True)
-def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
- w_q = w_q.t().contiguous().t() # make col major
- return ops.machete_prepack_B(w_q, wtype)
+ return w_ref, w_q, w_s, w_zp
-def make_bench_tensors(
- atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
- k: int
-) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
- torch.tensor]]]:
- assert wtype.is_integer(), "TODO: support floating point weights"
+def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
+ group_size: Optional[int]) -> List[BenchmarkTensors]:
+ m, n, k = shape
# we want to make sure that weights don't fit into L2 cache between runs so
# we construct enough weights to exceed L2 cache, which is 50mb on a H100
# so we target total weight size > 2*50mb
- num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
-
- a = torch.randn((m, k), device="cuda", dtype=atype) * 5
- weights = [
- torch.randn((k, n), device="cuda", dtype=atype)
- for _ in range(num_weights)
- ]
- quanitized_weights = [
- quantize_weights(w, wtype, group_size) for w in weights
- ]
-
- return a, quanitized_weights
+ num_weights = math.ceil(2 * 50 * 1024**2 * 8 /
+ (k * n * types.weight_type.size_bits))
+
+ a = rand_data((m, k), types.act_type, scale=5)
+
+ benchmark_tensors: List[BenchmarkTensors] = []
+ for _ in range(num_weights):
+ w = rand_data((k, n), types.act_type, scale=5)
+
+ if types.group_scale_type is not None:
+ w = w.to(types.group_scale_type)
+ if w.dtype.itemsize == 1:
+ w = w.to(torch.float16)
+
+ w_ref, w_q_packed, w_s, w_zp = quantize_and_pack(
+ a.dtype, w, types.weight_type, types.group_scale_type, group_size,
+ types.group_zero_type is not None)
+
+ if not a.dtype.is_floating_point:
+ aiinfo = torch.iinfo(a.dtype)
+ w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max)
+
+ w_ref = w_ref.to(torch.float32)
+
+ w_ch_s = None if types.channel_scale_type is None else\
+ rand_data((n,), types.channel_scale_type)
+ w_tok_s = None if types.token_scale_type is None else\
+ rand_data((m,), types.token_scale_type)
+
+ benchmark_tensors.append(
+ BenchmarkTensors(w_ref=w_ref,
+ a=a,
+ w_q=w_q_packed,
+ wtype=types.weight_type,
+ w_g_s=w_s,
+ w_g_zp=w_zp,
+ group_size=group_size,
+ w_ch_s=w_ch_s,
+ w_tok_s=w_tok_s))
+
+ return benchmark_tensors
+
+
+def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ a = bt.a
+ w = bt.w_ref.to(bt.a.dtype) # use float reference tensor
+ if a.dtype not in [torch.float16, torch.bfloat16]:
+ a = a.to(torch.float16)
+ w = w.to(torch.float16)
+ return lambda: torch.matmul(a, w)
+
+
+def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ if bt.w_ch_s is not None and bt.w_tok_s is not None:
+ scale_a = bt.w_tok_s.to(torch.float32)
+ scale_b = bt.w_ch_s.to(torch.float32)
+ else:
+ scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+ scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+ w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t()
+ return lambda: ops.cutlass_scaled_mm(
+ bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16)
+
+
+def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+ device = bt.a.device
+
+ workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_MAX_PARALLEL)
+
+ if bt.w_g_zp is None:
+ w_zp = torch.empty(0, dtype=torch.int, device=device)
+ else:
+ w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.wtype.size_bits)
+
+ if bt.group_size is None:
+ w_s = torch.tensor([], device="cuda", dtype=torch.half)
+ else:
+ w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.group_size)
+
+ sort_indices = torch.empty(0, dtype=torch.int, device=device)
+ g_idx = torch.empty(0, dtype=torch.int, device=device)
+ w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0],
+ bt.w_ref.shape[1], bt.wtype.size_bits)
+
+ if bt.a.dtype.is_floating_point:
+ assert bt.w_ch_s is None
+ assert bt.w_tok_s is None
+ assert bt.group_size is not None
+
+ fn = lambda: ops.gptq_marlin_gemm(a=bt.a,
+ b_q_weight=w_q,
+ b_scales=w_s,
+ b_zeros=w_zp,
+ g_idx=g_idx,
+ perm=sort_indices,
+ workspace=workspace.scratch,
+ b_q_type=bt.wtype,
+ size_m=bt.a.shape[0],
+ size_n=bt.w_ref.shape[1],
+ size_k=bt.w_ref.shape[0],
+ is_k_full=True,
+ is_zp_float=False)
+ else:
+ assert bt.a.dtype == torch.int8
+ assert bt.wtype == scalar_types.uint4b8
+
+ if bt.w_ch_s is not None:
+ s_ch = bt.w_ch_s.to(torch.float32)
+ else:
+ s_ch = torch.ones(bt.w_ref.shape[1],
+ dtype=torch.float32,
+ device=device)
+
+ if bt.w_tok_s is not None:
+ s_tok = bt.w_tok_s.to(torch.float32)
+ else:
+ s_tok = torch.ones(bt.a.shape[0],
+ dtype=torch.float32,
+ device=device)
+
+ fn = lambda: ops.marlin_qqq_gemm(a=bt.a,
+ b_q_weight=w_q,
+ s_group=w_s,
+ s_tok=s_tok,
+ s_ch=s_ch,
+ workspace=workspace.scratch,
+ size_m=bt.a.shape[0],
+ size_n=bt.w_ref.shape[1],
+ size_k=bt.w_ref.shape[0])
+
+ return fn
+
+
+def machete_create_bench_fn(bt: BenchmarkTensors,
+ out_type=torch.dtype,
+ schedule=None) -> Callable:
+ w_q = bt.w_q.t().contiguous().t() # make col major
+ w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype,
+ None if bt.w_g_s is None else bt.w_g_s.dtype)
+
+ w_g_zp = bt.w_g_zp
+ if w_g_zp is not None:
+ w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype))
+
+ return lambda: ops.machete_mm(
+ a=bt.a,
+ b_q=w_q,  # prepacked above
+ b_type=bt.wtype,
+ b_group_scales=bt.w_g_s,
+ b_group_zeros=w_g_zp,
+ b_group_size=bt.group_size,
+ b_channel_scales=bt.w_ch_s,
+ a_token_scales=bt.w_tok_s,
+ out_type=out_type,
+ schedule=schedule,
+ )
# impl
-
# bench
-def bench_fn(label: str, sub_label: str, description: str,
- fn: Callable) -> TMeasurement:
- min_run_time = 1
- return TBenchmark.Timer(
- stmt="fn()",
+
+def bench_fns(label: str, sub_label: str, description: str,
+ fns: List[Callable]):
+
+ min_run_time = 1 if not NVTX_PROFILE else 0.1
+ res = TBenchmark.Timer(
+ stmt="""
+ for fn in fns:
+ fn()
+ """,
globals={
- "fn": fn
+ "fns": fns
},
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
+ if NVTX_PROFILE:
+ with nvtx.annotate("mm-bench"), nvtx.annotate(
+ f"{label}|{sub_label}|{description}"):
+ fns[0]()
-def loop_over_weights(
- a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
- torch.tensor, torch.tensor]],
- fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
- None]):
- for w_ref, w_q, w_s, _ in weights:
- fn(a, w_ref, w_q, w_s)
+ return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
-def bench(atype: torch.dtype,
- wtype: ScalarType,
+def bench(types: TypeConfig,
group_size: int,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
- benchmark_marlinv1: bool = True,
- sweep_schedules: bool = True) -> Iterable[TMeasurement]:
- global _SWEEP_SCHEDULES_RESULTS
-
- a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
- sub_label += f", L={len(weights)}"
-
- weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
- for w_ref, w_q, w_s, w_zp in weights]
+ sweep_schedules: bool = True) -> List[TMeasurement]:
+ benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
+ sub_label += f", L={len(benchmark_tensors)}"
+
+ name_type_string = f"W{types.weight_type}"+\
+ f"-A{terse_type_name(types.act_type)}"
+ if types.group_scale_type is not None:
+ name_type_string += f"-GS{terse_type_name(types.group_scale_type)}"
+ if types.group_zero_type is not None:
+ name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}"
+ if group_size is not None:
+ name_type_string += f"-G{group_size}"
+ if types.channel_scale_type is not None:
+ name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}"
+ if types.token_scale_type is not None:
+ name_type_string += f"-TS{terse_type_name(types.token_scale_type)}"
timers = []
# pytorch impl
timers.append(
- bench_fn(
- label, sub_label, "torch.matmul", lambda: loop_over_weights(
- a,
- weights,
- lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
- )))
+ bench_fns(
+ label, sub_label, "torch.matmul (fp16)",
+ [torch_matmul_f16_create_bench_fn(bt)
+ for bt in benchmark_tensors]))
- if benchmark_marlinv1:
- w_ref = weights[0][0]
-
- w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
- sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
- g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
-
- def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
- w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
- return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
- wtype.size_bits)
-
- def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
- return marlin_permute_scales(w_s, *w_ref.shape, group_size)
-
- weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
- marlinv1_permute_scales(w_s), w_zp)
- for w_ref, w_q, w_s, w_zp in weights]
-
- workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_MAX_PARALLEL)
-
- # marlinv1
+ if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn:
+ timers.append(
+ bench_fns(
+ label, sub_label,
+ f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [
+ cutlass_scaled_mm_create_bench_fn(bt)
+ for bt in benchmark_tensors
+ ]))
+
+ if types.act_type != torch.float8_e4m3fn:
timers.append(
- bench_fn(
- label, sub_label, "marlin_orig", lambda: loop_over_weights(
- a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
- gptq_marlin_gemm(a,
- w_q,
- w_s,
- w_zp_empty,
- g_idx,
- sort_indices,
- workspace.scratch,
- wtype,
- size_m=a.shape[0],
- size_n=w_ref.shape[1],
- size_k=w_ref.shape[0],
- is_k_full=True))))
+ bench_fns(label, sub_label, f"marlin ({name_type_string})",
+ [marlin_create_bench_fn(bt)
+ for bt in benchmark_tensors]))
# machete
timers.append(
- bench_fn(
- label, sub_label, "machete_heuristic", lambda: loop_over_weights(
- a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
- a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
+ bench_fns(label, sub_label, f"machete ({name_type_string})", [
+ machete_create_bench_fn(bt, out_type=types.output_type)
+ for bt in benchmark_tensors
+ ]))
if sweep_schedules:
+ global _SWEEP_SCHEDULES_RESULTS
+
print("Finding best schedule for machete")
best = None
best_schedule = None
- schedules = ops.machete_supported_schedules(wtype)
+ schedules = ops.machete_supported_schedules(
+ a_type=types.act_type,
+ b_type=types.weight_type,
+ group_scales_type=types.group_scale_type,
+ group_zeros_type=types.group_zero_type,
+ token_scales_type=types.token_scale_type,
+ channel_scales_type=types.channel_scale_type,
+ out_type=types.output_type)
+
+ if schedules is None or len(schedules) == 0:
+ raise ValueError("No schedules found to sweep")
+
for schedule in reversed(schedules):
schedule_M = int(schedule.split("_")[0].split("x")[1])
@@ -177,16 +381,11 @@ def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
continue
- def run(a, _, w_q, w_s, schedule=schedule):
- ops.machete_gemm(a,
- w_q,
- wtype,
- w_s,
- b_group_size=group_size,
- schedule=schedule)
-
- res = bench_fn(label, sub_label, "machete_best",
- lambda: loop_over_weights(a, weights_machete, run))
+ res = bench_fns(label, sub_label, "machete_best", [
+ machete_create_bench_fn(
+ bt, out_type=types.output_type, schedule=schedule)
+ for bt in benchmark_tensors
+ ])
results_row = {
"M": m,
@@ -213,25 +412,33 @@ def run(a, _, w_q, w_s, schedule=schedule):
# runner
-def print_timers(timers: Iterable[TMeasurement]):
+def print_timers(timers: List[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
-def run(dtype: torch.dtype, sweep_schedules: bool,
- MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+ types = TypeConfig(
+ act_type=args.act_type,
+ weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
+ else scalar_types.uint4,
+ output_type=args.out_type,
+ group_scale_type=args.group_scale_type,
+ group_zero_type=args.group_zero_type,
+ channel_scale_type=args.channel_scale_type,
+ token_scale_type=args.token_scale_type,
+ )
- results = []
+ results: List[TMeasurement] = []
for m, k, n in MKNs:
- timers = bench(dtype,
- scalar_types.uint4b8,
- 128,
+ timers = bench(types,
+ args.group_size,
m,
k,
n,
- f"{dtype}-gemm",
+ f"{args.act_type}-gemm",
f"MKN=({m}x{k}x{n})",
- sweep_schedules=sweep_schedules)
+ sweep_schedules=args.sweep_schedules)
print_timers(timers)
results.extend(timers)
@@ -240,7 +447,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool,
# output makers
def make_output(
- data: Iterable[TMeasurement],
+ data: List[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
@@ -262,17 +469,16 @@ def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
- m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
- m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+ m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+ m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
m_increment, k_increment, n_increment = \
- [int(x) for x in args.dim_increment.split(",")]
+ (int(x) for x in args.dim_increment.split(","))
Ms = list(range(m_start, m_end + 1, m_increment))
Ks = list(range(k_start, k_end + 1, k_increment))
Ns = list(range(n_start, n_end + 1, n_increment))
@@ -306,33 +512,49 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
for k, n in KNs:
MKNs.append((m, k, n))
- data = run(args.dtype, args.sweep_schedules, MKNs)
+ data = run(args, MKNs)
model_bench_data.append(data)
+ type_string = f"{args.act_type}"
+
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
- print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+ print(f"== Results {type_string} {model}-TP{tp_size} ====")
print_timers(data)
- timestamp = int(time.time())
+ timestr = time.strftime("%Y%m%d-%H%M%S")
- all_data = []
+ all_results = []
for d in model_bench_data:
- all_data.extend(d)
+ all_results.extend(d)
+
# pickle all data
- with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
- pkl.dump(all_data, f)
+ with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f:
+ args_dict = vars(args)
+ args_dict.pop("func")
+ pkl.dump({
+ "args": args_dict,
+ "results": all_results,
+ }, f)
if __name__ == "__main__":
def to_torch_dtype(dt):
- if dt == "bfloat16":
- return torch.bfloat16
- if dt == "float16":
- return torch.float16
- raise ValueError("unsupported dtype")
+ return {
+ "bfloat16": torch.bfloat16,
+ "float16": torch.float16,
+ "int8": torch.int8,
+ "float8_e4m3fn": torch.float8_e4m3fn,
+ "int": torch.int,
+ "float": torch.float,
+ }[dt]
+
+ class ToTorchDtype(argparse.Action):
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ setattr(namespace, self.dest, to_torch_dtype(values))
parser = FlexibleArgumentParser(
description="""
@@ -352,12 +574,42 @@ def to_torch_dtype(dt):
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter,
)
-
parser.add_argument(
- "--dtype",
- type=to_torch_dtype,
+ "--act-type",
+ action=ToTorchDtype,
required=True,
- help="Available options are ['bfloat16', 'float16']",
+ choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'],
+ )
+ parser.add_argument(
+ "--group-scale-type",
+ action=ToTorchDtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--group-zero-type",
+ type=to_torch_dtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--channel-scale-type",
+ action=ToTorchDtype,
+ choices=['float'],
+ )
+ parser.add_argument(
+ "--token-scale-type",
+ action=ToTorchDtype,
+ choices=['float'],
+ )
+ parser.add_argument(
+ "--out-type",
+ action=ToTorchDtype,
+ choices=['bfloat16', 'float16'],
+ )
+ parser.add_argument(
+ "--group-size",
+ type=int,
+ help="Available options are ['None', '-1', '128'], default=128",
+ default=128,
)
parser.add_argument(
"--sweep-schedules",
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 536c133bb3341..8fb44e3a3dbd8 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
- "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c2ad98b7e2656..8f538c21f7f7e 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -10,7 +10,8 @@
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
class BenchmarkConfig(TypedDict):
@@ -88,22 +89,23 @@ def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- override_config=config,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- )
+ from vllm.model_executor.layers.fused_moe import override_config
+ with override_config(config):
+ fused_moe(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ )
# JIT compilation & warmup
run()
@@ -166,7 +168,7 @@ class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
- seed_everything(seed)
+ current_platform.seed_everything(seed)
self.seed = seed
def benchmark(
@@ -180,7 +182,7 @@ def benchmark(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]:
- seed_everything(self.seed)
+ current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
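The MoE benchmark change above drops the `override_config=` keyword from `fused_moe` and wraps the call in a context manager instead. A sketch of the pattern follows; the tensor shapes are placeholders and the tuning config is hypothetical (its keys follow the `BenchmarkConfig` TypedDict used by this script), and `fused_moe` is assumed to be importable from the package as in the hunk above.

```python
# Sketch: placeholder tensors and a hypothetical config, only to show the
# override_config context-manager pattern introduced above.
import torch
from vllm.model_executor.layers.fused_moe import fused_moe, override_config

M, E, K, N, topk = 4, 8, 128, 256, 2
x = torch.randn(M, K, device="cuda", dtype=torch.float16)
w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=torch.float16)
w2 = torch.randn(E, K, N, device="cuda", dtype=torch.float16)
gating_output = torch.randn(M, E, device="cuda", dtype=torch.float16)

config = {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32,
          "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}

with override_config(config):  # every fused_moe call in this block uses `config`
    out = fused_moe(x, w1, w2, gating_output, topk,
                    renormalize=True, inplace=True)
```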
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 87864d038d593..14eef00b855ac 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -5,8 +5,9 @@
import torch
from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- create_kv_caches_with_random, seed_everything)
+ create_kv_caches_with_random)
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -28,7 +29,7 @@ def main(
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 743a5744e8614..1d62483448946 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -3,8 +3,8 @@
import torch
from vllm import _custom_ops as ops
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
@@ -17,7 +17,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 784b1cf9844e4..250d505168d09 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -6,7 +6,8 @@
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192,
base: int = 10000,
) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index de608fd05af70..7d0bd84150a27 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -20,10 +20,11 @@
args = parser.parse_args()
with open(args.filename, 'rb') as f:
- data: List[TMeasurement] = pickle.load(f)
+ data = pickle.load(f)
+ raw_results: List[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
- for v in data:
+ for v in raw_results:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
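graph_machete_bench.py now expects the richer pickle layout written by the updated model_bench runner: a dict holding both the CLI args and the measurement list. A small sketch of reading such a file; the filename is only an example of the `model_bench-{act_type}-{timestamp}.pkl` pattern shown above.

```python
# Sketch of consuming the new pickle layout; the filename is illustrative.
import pickle

with open("model_bench-torch.float16-20250101-120000.pkl", "rb") as f:
    data = pickle.load(f)

print(data["args"])              # dict of the benchmark's CLI arguments
measurements = data["results"]   # list of TMeasurement results, as before
```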
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index 25ec9d6028627..51f24f3ba1774 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -40,4 +40,10 @@
([8192, 57344], 1),
([28672, 8192], 0),
],
+ "meta-llama/Llama-3.1-405b-hf": [
+ ([16384, 18432], 1),
+ ([16384, 16384], 0),
+ ([16384, 106496], 1),
+ ([53248, 16384], 0),
+ ],
}
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
index 8c5cd454fbbee..ba7383d88dc49 100755
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,13 +4,13 @@ PORT=8000
MODEL=$1
TOKENS=$2
-docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
- -v $PWD/data:/data \
+docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
+ -v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
- --model-id $MODEL \
+ --model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
- --max-batch-total-tokens $TOKENS
+ --max-batch-total-tokens "$TOKENS"
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index 203699e9a8d06..d16d6f9fba442 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -16,7 +16,6 @@ def main(args):
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
- use_v2_block_manager=args.use_v2_block_manager,
)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -56,8 +55,5 @@ def main(args):
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index bc5f24d3f591c..426189481575b 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,5 +1,8 @@
+include(FetchContent)
+
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_CXX_STANDARD 17)
#
# Define environment variables for special configurations
@@ -13,9 +16,16 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
# Check the compile flags
#
-list(APPEND CXX_COMPILE_FLAGS
- "-fopenmp"
- "-DVLLM_CPU_EXTENSION")
+if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
+ list(APPEND CXX_COMPILE_FLAGS
+ "-fopenmp"
+ "-DVLLM_CPU_EXTENSION")
+else()
+ list(APPEND CXX_COMPILE_FLAGS
+ "-fopenmp"
+ "-mf16c"
+ "-DVLLM_CPU_EXTENSION")
+endif()
execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
@@ -82,15 +92,40 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif()
+#
+# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
+#
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+ FetchContent_Declare(
+ oneDNN
+ GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+ GIT_TAG v3.6
+ GIT_PROGRESS TRUE
+ GIT_SHALLOW TRUE
+ )
+
+ set(ONEDNN_LIBRARY_TYPE "STATIC")
+ set(ONEDNN_BUILD_DOC "OFF")
+ set(ONEDNN_BUILD_EXAMPLES "OFF")
+ set(ONEDNN_BUILD_TESTS "OFF")
+ set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
+ set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+ set(ONEDNN_BUILD_GRAPH "OFF")
+ set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+ set(ONEDNN_ENABLE_ITT_TASKS "OFF")
+ set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
+ set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+ FetchContent_MakeAvailable(oneDNN)
+
+ list(APPEND LIBS dnnl)
+endif()
+
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS numa)
-# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
-if (AVX2_FOUND OR AVX512_FOUND)
- list(APPEND LIBS dnnl)
-endif()
-
#
# _C extension
#
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 24bb7299338ac..40430dae10c5b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if (GPU_LANGUAGE STREQUAL "CUDA")
- if ("${CUDA_CUDA_LIB}" STREQUAL "")
- set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
- endif()
- target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
- ${CUDA_LIBRARIES})
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
else()
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
endif()
diff --git a/collect_env.py b/collect_env.py
index 80403d576d78f..254c19b19a5ac 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -1,17 +1,19 @@
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
-# Unlike the rest of the PyTorch this file must be python2 compliant.
-# This script outputs relevant system environment info
-# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
import datetime
import locale
import os
import re
import subprocess
import sys
+# Unlike the rest of PyTorch, this file must be python2 compliant.
+# This script outputs relevant system environment info
+# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple
+from vllm.envs import environment_variables
+
try:
import torch
TORCH_AVAILABLE = True
@@ -52,6 +54,7 @@
'vllm_version', # vllm specific field
'vllm_build_flags', # vllm specific field
'gpu_topo', # vllm specific field
+ 'env_vars',
])
DEFAULT_CONDA_PATTERNS = {
@@ -512,6 +515,22 @@ def is_xnnpack_available():
else:
return "N/A"
+def get_env_vars():
+ env_vars = ''
+ secret_terms=('secret', 'token', 'api', 'access', 'password')
+ report_prefix = ("TORCH", "NCCL", "PYTORCH",
+ "CUDA", "CUBLAS", "CUDNN",
+ "OMP_", "MKL_",
+ "NVIDIA")
+ for k, v in os.environ.items():
+ if any(term in k.lower() for term in secret_terms):
+ continue
+ if k in environment_variables:
+ env_vars = env_vars + "{}={}".format(k, v) + "\n"
+ if k.startswith(report_prefix):
+ env_vars = env_vars + "{}={}".format(k, v) + "\n"
+
+ return env_vars
def get_env_info():
run_lambda = run
@@ -583,6 +602,7 @@ def get_version_or_na(cfg, prefix):
vllm_version=vllm_version,
vllm_build_flags=vllm_build_flags,
gpu_topo=gpu_topo,
+ env_vars=get_env_vars(),
)
@@ -631,6 +651,8 @@ def get_version_or_na(cfg, prefix):
{vllm_build_flags}
GPU Topology:
{gpu_topo}
+
+{env_vars}
""".strip()
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 5ed1dc3b8f792..839dc36ba4e29 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -89,6 +89,48 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
namespace vllm {
+template <typename T>
+__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
+ const float f = (float)x;
+ return (T)(f > threshold ? f : 0.0f);
+}
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
+__global__ void act_and_mul_kernel_with_param(
+ scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
+ const float param) {
+ const int64_t token_idx = blockIdx.x;
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
+ const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
+ out[token_idx * d + idx] = ACT_FN(x, param) * y;
+ }
+}
+
+} // namespace vllm
+
+#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
+ int d = input.size(-1) / 2; \
+ int64_t num_tokens = input.numel() / input.size(-1); \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
+ vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
+ <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
+ input.data_ptr<scalar_t>(), d, \
+ PARAM); \
+ });
+
+void fatrelu_and_mul(torch::Tensor& out, // [..., d],
+ torch::Tensor& input, // [..., 2 * d]
+ double threshold) {
+ LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
+}
+namespace vllm {
+
// Element-wise activation kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
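The new `fatrelu_and_mul` entry point above follows the same gated-activation layout as the other `*_and_mul` kernels: the first half of the last dimension goes through FATReLU (identity above the threshold, zero otherwise) and is multiplied elementwise with the second half. A pure-PyTorch reference sketch of that contract, not the CUDA implementation:

```python
# Reference semantics only; the CUDA kernel above writes into a preallocated
# `out` tensor of shape [..., d] given an input of shape [..., 2 * d].
import torch


def fatrelu_and_mul_ref(x: torch.Tensor, threshold: float) -> torch.Tensor:
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    gate = torch.where(gate > threshold, gate, torch.zeros_like(gate))
    return gate * up
```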
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cuh
similarity index 64%
rename from csrc/attention/attention_kernels.cu
rename to csrc/attention/attention_kernels.cuh
index bcd170411e7cb..563e1438f0b01 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cuh
@@ -670,332 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel(
} // namespace vllm
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
- VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
- ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE>), \
- shared_mem_size); \
- vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE> \
- <<<grid, block, shared_mem_size, stream>>>( \
- out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
- scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
- alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
- k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
- blocksparse_vert_stride, blocksparse_block_size, \
- blocksparse_head_sliding_step);
-
-// TODO(woosuk): Tune NUM_THREADS.
-template <typename T, typename CACHE_T, int BLOCK_SIZE, vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE, int NUM_THREADS = 128>
-void paged_attention_v1_launcher(
- torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
- const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
- float v_scale, const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
- int num_seqs = query.size(0);
- int num_heads = query.size(1);
- int head_size = query.size(2);
- int max_num_blocks_per_seq = block_tables.size(1);
- int q_stride = query.stride(0);
- int kv_block_stride = key_cache.stride(0);
- int kv_head_stride = key_cache.stride(1);
-
- [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
- assert(head_size % thread_group_size == 0);
-
- // NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
-
- T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
- T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
- CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
- CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
- int* block_tables_ptr = block_tables.data_ptr<int>();
- int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
- constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int padded_max_seq_len =
- DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
- int logits_size = padded_max_seq_len * sizeof(float);
- int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
- // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
- // Keep that in sync with the logic here!
- int shared_mem_size = std::max(logits_size, outputs_size);
-
- dim3 grid(num_heads, num_seqs, 1);
- dim3 block(NUM_THREADS);
- const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- switch (head_size) {
- // NOTE(woosuk): To reduce the compilation time, we only compile for the
- // head sizes that we use in the model. However, we can easily extend this
- // to support any head size which is a multiple of 16.
- case 64:
- LAUNCH_PAGED_ATTENTION_V1(64);
- break;
- case 80:
- LAUNCH_PAGED_ATTENTION_V1(80);
- break;
- case 96:
- LAUNCH_PAGED_ATTENTION_V1(96);
- break;
- case 112:
- LAUNCH_PAGED_ATTENTION_V1(112);
- break;
- case 120:
- LAUNCH_PAGED_ATTENTION_V1(120);
- break;
- case 128:
- LAUNCH_PAGED_ATTENTION_V1(128);
- break;
- case 192:
- LAUNCH_PAGED_ATTENTION_V1(192);
- break;
- case 256:
- LAUNCH_PAGED_ATTENTION_V1(256);
- break;
- default:
- TORCH_CHECK(false, "Unsupported head size: ", head_size);
- break;
- }
-}
-
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
- paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE>( \
- out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
- seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
- blocksparse_local_blocks, blocksparse_vert_stride, \
- blocksparse_block_size, blocksparse_head_sliding_step);
-
-#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
- }
-
-// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
- switch (block_size) { \
- case 8: \
- CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
- break; \
- case 16: \
- CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
- break; \
- case 32: \
- CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
- }
-
-void paged_attention_v1(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor&
- key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor&
- value_cache, // [num_blocks, num_heads, head_size, block_size]
- int64_t num_kv_heads, // [num_heads]
- double scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& seq_lens, // [num_seqs]
- int64_t block_size, int64_t max_seq_len,
- const c10::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, double k_scale, double v_scale,
- const int64_t tp_rank, const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- const bool is_block_sparse = (blocksparse_vert_stride > 1);
-
- DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
- CALL_V1_LAUNCHER_BLOCK_SIZE)
-}
-
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
- vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE> \
- <<<grid, block, shared_mem_size, stream>>>( \
- exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
- value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
- seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
- kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
- blocksparse_local_blocks, blocksparse_vert_stride, \
- blocksparse_block_size, blocksparse_head_sliding_step); \
- vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE> \
- <<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
- out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
- max_num_partitions);
-
-template <typename T, typename CACHE_T, int BLOCK_SIZE, vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE, int NUM_THREADS = 128, int PARTITION_SIZE = 512>
-void paged_attention_v2_launcher(
- torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
- torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
- const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
- float v_scale, const int tp_rank, const int blocksparse_local_blocks,
- const int blocksparse_vert_stride, const int blocksparse_block_size,
- const int blocksparse_head_sliding_step) {
- int num_seqs = query.size(0);
- int num_heads = query.size(1);
- int head_size = query.size(2);
- int max_num_blocks_per_seq = block_tables.size(1);
- int q_stride = query.stride(0);
- int kv_block_stride = key_cache.stride(0);
- int kv_head_stride = key_cache.stride(1);
-
- [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
- assert(head_size % thread_group_size == 0);
-
- // NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
-
- T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
- float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
- float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
- T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
- T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
- CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
- CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
- int* block_tables_ptr = block_tables.data_ptr<int>();
- int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
- constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
- int logits_size = PARTITION_SIZE * sizeof(float);
- int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
-
- // For paged attention v2 kernel.
- dim3 grid(num_heads, num_seqs, max_num_partitions);
- int shared_mem_size = std::max(logits_size, outputs_size);
- // For paged attention v2 reduce kernel.
- dim3 reduce_grid(num_heads, num_seqs);
- int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
-
- dim3 block(NUM_THREADS);
- const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- switch (head_size) {
- // NOTE(woosuk): To reduce the compilation time, we only compile for the
- // head sizes that we use in the model. However, we can easily extend this
- // to support any head size which is a multiple of 16.
- case 64:
- LAUNCH_PAGED_ATTENTION_V2(64);
- break;
- case 80:
- LAUNCH_PAGED_ATTENTION_V2(80);
- break;
- case 96:
- LAUNCH_PAGED_ATTENTION_V2(96);
- break;
- case 112:
- LAUNCH_PAGED_ATTENTION_V2(112);
- break;
- case 120:
- LAUNCH_PAGED_ATTENTION_V2(120);
- break;
- case 128:
- LAUNCH_PAGED_ATTENTION_V2(128);
- break;
- case 192:
- LAUNCH_PAGED_ATTENTION_V2(192);
- break;
- case 256:
- LAUNCH_PAGED_ATTENTION_V2(256);
- break;
- default:
- TORCH_CHECK(false, "Unsupported head size: ", head_size);
- break;
- }
-}
-
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
- paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE>( \
- out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
- num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
- k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
- blocksparse_vert_stride, blocksparse_block_size, \
- blocksparse_head_sliding_step);
-
-#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
- }
-
-// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
- switch (block_size) { \
- case 8: \
- CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
- break; \
- case 16: \
- CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
- break; \
- case 32: \
- CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
- }
-
-void paged_attention_v2(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor&
- tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor&
- key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor&
- value_cache, // [num_blocks, num_heads, head_size, block_size]
- int64_t num_kv_heads, // [num_heads]
- double scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& seq_lens, // [num_seqs]
- int64_t block_size, int64_t max_seq_len,
- const c10::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, double k_scale, double v_scale,
- const int64_t tp_rank, const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- const bool is_block_sparse = (blocksparse_vert_stride > 1);
- DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
- CALL_V2_LAUNCHER_BLOCK_SIZE)
-}
-
#undef WARP_SIZE
#undef MAX
#undef MIN
diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu
new file mode 100644
index 0000000000000..741cd0c82dc89
--- /dev/null
+++ b/csrc/attention/paged_attention_v1.cu
@@ -0,0 +1,196 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Copyright (c) 2023, The vLLM team.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "attention_kernels.cuh"
+
+#ifndef USE_ROCM
+ #define WARP_SIZE 32
+#else
+ #define WARP_SIZE warpSize
+#endif
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
+ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
+ ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
+ NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE>), \
+ shared_mem_size); \
+ vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
+ NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE> \
+ <<<grid, block, shared_mem_size, stream>>>( \
+ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
+ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
+ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
+ k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
+ blocksparse_vert_stride, blocksparse_block_size, \
+ blocksparse_head_sliding_step);
+
+// TODO(woosuk): Tune NUM_THREADS.
+template <typename T, typename CACHE_T, int BLOCK_SIZE,
+ vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
+ int NUM_THREADS = 128>
+void paged_attention_v1_launcher(
+ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+ const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+ float v_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+ int num_seqs = query.size(0);
+ int num_heads = query.size(1);
+ int head_size = query.size(2);
+ int max_num_blocks_per_seq = block_tables.size(1);
+ int q_stride = query.stride(0);
+ int kv_block_stride = key_cache.stride(0);
+ int kv_head_stride = key_cache.stride(1);
+
+ [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+ assert(head_size % thread_group_size == 0);
+
+ // NOTE: alibi_slopes is optional.
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+ ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+ : nullptr;
+
+ T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+ T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+ CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+ CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+ int* block_tables_ptr = block_tables.data_ptr<int>();
+ int* seq_lens_ptr = seq_lens.data_ptr<int>();
+
+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+ int padded_max_seq_len =
+ DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+ int logits_size = padded_max_seq_len * sizeof(float);
+ int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+ // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
+ // Keep that in sync with the logic here!
+ int shared_mem_size = std::max(logits_size, outputs_size);
+
+ dim3 grid(num_heads, num_seqs, 1);
+ dim3 block(NUM_THREADS);
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+ switch (head_size) {
+ // NOTE(woosuk): To reduce the compilation time, we only compile for the
+ // head sizes that we use in the model. However, we can easily extend this
+ // to support any head size which is a multiple of 16.
+ case 32:
+ LAUNCH_PAGED_ATTENTION_V1(32);
+ break;
+ case 64:
+ LAUNCH_PAGED_ATTENTION_V1(64);
+ break;
+ case 80:
+ LAUNCH_PAGED_ATTENTION_V1(80);
+ break;
+ case 96:
+ LAUNCH_PAGED_ATTENTION_V1(96);
+ break;
+ case 112:
+ LAUNCH_PAGED_ATTENTION_V1(112);
+ break;
+ case 120:
+ LAUNCH_PAGED_ATTENTION_V1(120);
+ break;
+ case 128:
+ LAUNCH_PAGED_ATTENTION_V1(128);
+ break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V1(192);
+ break;
+ case 256:
+ LAUNCH_PAGED_ATTENTION_V1(256);
+ break;
+ default:
+ TORCH_CHECK(false, "Unsupported head size: ", head_size);
+ break;
+ }
+}
+
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+ paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE>( \
+ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
+ seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
+
+// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
+ }
+
+void paged_attention_v1(
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int64_t num_kv_heads, // [num_heads]
+ double scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int64_t block_size, int64_t max_seq_len,
+ const c10::optional<torch::Tensor>& alibi_slopes,
+ const std::string& kv_cache_dtype, double k_scale, double v_scale,
+ const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+ const int64_t blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V1_LAUNCHER_BLOCK_SIZE)
+}
+
+#undef WARP_SIZE
+#undef MAX
+#undef MIN
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
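The v1 launcher above sizes dynamic shared memory as the larger of the softmax-logits buffer (padded sequence length) and the per-warp output buffer. The following standalone sketch (not part of the patch; the block size, sequence length, and head size are illustrative) reproduces that arithmetic on the host so a configuration can be sanity-checked:

#include <algorithm>
#include <cstdio>

int main() {
  // Defaults assumed from the launcher above.
  constexpr int NUM_THREADS = 128;
  constexpr int WARP_SIZE = 32;
  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;

  const int block_size = 16, max_seq_len = 4096, head_size = 128;

  // Same arithmetic as paged_attention_v1_launcher.
  const int padded_max_seq_len =
      (max_seq_len + block_size - 1) / block_size * block_size;
  const int logits_size = padded_max_seq_len * static_cast<int>(sizeof(float));
  const int outputs_size =
      (NUM_WARPS / 2) * head_size * static_cast<int>(sizeof(float));
  const int shared_mem_size = std::max(logits_size, outputs_size);

  std::printf("padded_max_seq_len=%d, shared_mem_size=%d bytes\n",
              padded_max_seq_len, shared_mem_size);
  return 0;
}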
diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu
new file mode 100644
index 0000000000000..6de8d0bdd5b8d
--- /dev/null
+++ b/csrc/attention/paged_attention_v2.cu
@@ -0,0 +1,206 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Copyright (c) 2023, The vLLM team.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "attention_kernels.cuh"
+
+#ifndef USE_ROCM
+ #define WARP_SIZE 32
+#else
+ #define WARP_SIZE warpSize
+#endif
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
+ vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
+ NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE> \
+ <<<grid, block, shared_mem_size, stream>>>( \
+ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
+ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
+ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
+ kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step); \
+ vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, \
+ PARTITION_SIZE> \
+ <<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
+ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
+ max_num_partitions);
+
+template <typename T, typename CACHE_T, int BLOCK_SIZE,
+ vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
+ int NUM_THREADS = 128, int PARTITION_SIZE = 512>
+void paged_attention_v2_launcher(
+ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+ const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+ float v_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+ int num_seqs = query.size(0);
+ int num_heads = query.size(1);
+ int head_size = query.size(2);
+ int max_num_blocks_per_seq = block_tables.size(1);
+ int q_stride = query.stride(0);
+ int kv_block_stride = key_cache.stride(0);
+ int kv_head_stride = key_cache.stride(1);
+
+ [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+ assert(head_size % thread_group_size == 0);
+
+ // NOTE: alibi_slopes is optional.
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+ ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+ : nullptr;
+
+ T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+ float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
+ float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
+ T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
+ T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+ CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+ CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+ int* block_tables_ptr = block_tables.data_ptr<int>();
+ int* seq_lens_ptr = seq_lens.data_ptr<int>();
+
+ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+ int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
+ int logits_size = PARTITION_SIZE * sizeof(float);
+ int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+
+ // For paged attention v2 kernel.
+ dim3 grid(num_heads, num_seqs, max_num_partitions);
+ int shared_mem_size = std::max(logits_size, outputs_size);
+ // For paged attention v2 reduce kernel.
+ dim3 reduce_grid(num_heads, num_seqs);
+ int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
+
+ dim3 block(NUM_THREADS);
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+ switch (head_size) {
+ // NOTE(woosuk): To reduce the compilation time, we only compile for the
+ // head sizes that we use in the model. However, we can easily extend this
+ // to support any head size which is a multiple of 16.
+ case 32:
+ LAUNCH_PAGED_ATTENTION_V2(32);
+ break;
+ case 64:
+ LAUNCH_PAGED_ATTENTION_V2(64);
+ break;
+ case 80:
+ LAUNCH_PAGED_ATTENTION_V2(80);
+ break;
+ case 96:
+ LAUNCH_PAGED_ATTENTION_V2(96);
+ break;
+ case 112:
+ LAUNCH_PAGED_ATTENTION_V2(112);
+ break;
+ case 120:
+ LAUNCH_PAGED_ATTENTION_V2(120);
+ break;
+ case 128:
+ LAUNCH_PAGED_ATTENTION_V2(128);
+ break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V2(192);
+ break;
+ case 256:
+ LAUNCH_PAGED_ATTENTION_V2(256);
+ break;
+ default:
+ TORCH_CHECK(false, "Unsupported head size: ", head_size);
+ break;
+ }
+}
+
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+ paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE>( \
+ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
+ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
+ k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
+ blocksparse_vert_stride, blocksparse_block_size, \
+ blocksparse_head_sliding_step);
+
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
+
+// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
+ }
+
+void paged_attention_v2(
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor&
+ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int64_t num_kv_heads, // [num_heads]
+ double scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int64_t block_size, int64_t max_seq_len,
+ const c10::optional<torch::Tensor>& alibi_slopes,
+ const std::string& kv_cache_dtype, double k_scale, double v_scale,
+ const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+ const int64_t blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V2_LAUNCHER_BLOCK_SIZE)
+}
+
+#undef WARP_SIZE
+#undef MAX
+#undef MIN
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
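paged_attention_v2 differs from v1 by splitting each sequence into fixed-size partitions and combining the partial results in a second, reduction kernel. A small host-side sketch of the partition and reduction-buffer arithmetic used by the launcher above (not part of the patch; PARTITION_SIZE = 512 is the assumed launcher default, and the sequence/head counts are illustrative):

#include <cstdio>

int main() {
  constexpr int PARTITION_SIZE = 512;  // assumed launcher default
  const int num_seqs = 8, num_heads = 32, max_seq_len = 8192;

  // DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE) in the launcher.
  const int max_num_partitions =
      (max_seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;

  // Main kernel: one block per (head, seq, partition).
  std::printf("main grid = (%d, %d, %d)\n",
              num_heads, num_seqs, max_num_partitions);

  // Reduce kernel keeps per-partition max logits and exp sums in shared memory.
  const int reduce_shared_mem_size =
      2 * max_num_partitions * static_cast<int>(sizeof(float));
  std::printf("reduce grid = (%d, %d), shared memory = %d bytes\n",
              num_heads, num_seqs, reduce_shared_mem_size);
  return 0;
}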
diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
index 0e1f360d74bd5..408e736d5bc0f 100644
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@@ -1,6 +1,7 @@
#pragma once
-#include <torch/custom_class.h>
+// For TORCH_CHECK
+#include <torch/library.h>
namespace vllm {
@@ -9,12 +10,7 @@ namespace vllm {
// in particular it can be used to represent sub-byte data types (something
// that torch.dtype currently does not support).
//
-// ScalarTypeTorch is a subclass of ScalarType that is compatible with
-// TORCH_LIBRARY, making it accessible from Python as well meaning this class
-// can be used as a argument for custom operators, helping to simplify these
-// interfaces.
-//
-// The type definitions on the Python side can be found in: vllm/_core_ext.pyi
+// The type definitions on the Python side can be found in: vllm/scalar_type.py
// these type definitions should be kept up to date with any Python API changes
// here.
//
@@ -308,204 +304,7 @@ class ScalarType {
}
};
-// Create a TORCH_LIBRARY compatible version of ScalarType (i.e. inherit from
-// torch::CustomClassHolder), we use multiple inheritance here since we cannot
-// have ScalarType inherit from torch::CustomClassHolder and have a constexpr
-// constructor at the same time (torch::CustomClassHolder does not have a
-// constexpr destructor)
-// See also:
-// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
-class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
- public:
- ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias,
- bool _signed)
- : ScalarType(exponent, mantissa, bias, _signed){};
-
- ScalarTypeTorch(ScalarType type) : ScalarType(type){};
-
- using Base = ScalarType;
- using Self = ScalarTypeTorch;
- using SelfPtr = c10::intrusive_ptr<Self>;
-
- static void check_size_bits(int64_t size_bits, bool signed_) {
- TORCH_CHECK(
- size_bits <=
- std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
- "size_bits bit width is too large to be represented");
- }
-
- static void check_bias(int64_t bias) {
- using Bias = decltype(std::declval().bias);
- TORCH_CHECK(bias <= std::numeric_limits<Bias>::max() &&
- bias >= std::numeric_limits<Bias>::min(),
- "bias too large or small to be represented");
- }
-
- static void check_exponent(int64_t exponent) {
- TORCH_CHECK(
- exponent <=
- std::numeric_limits<decltype(std::declval<Self>().exponent)>::max(),
- "exponent bit width is too large to be represented");
- }
-
- static void check_mantissa(int64_t mantissa) {
- TORCH_CHECK(
- mantissa <=
- std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
- "mantissa bit width is too large to be represented");
- }
-
- static SelfPtr int_(int64_t size_bits, c10::optional<int64_t> bias) {
- check_size_bits(size_bits, true);
- check_bias(bias.value_or(0));
- return c10::make_intrusive<Self>(
- ScalarType::int_(size_bits, bias.value_or(0)));
- }
-
- static SelfPtr uint(int64_t size_bits, c10::optional<int64_t> bias) {
- check_size_bits(size_bits, true);
- check_bias(bias.value_or(0));
- return c10::make_intrusive<Self>(
- ScalarType::uint(size_bits, bias.value_or(0)));
- }
-
- static SelfPtr float_IEEE754(int64_t exponent, int64_t mantissa) {
- check_mantissa(mantissa);
- check_exponent(exponent);
- return c10::make_intrusive<Self>(
- ScalarType::float_IEEE754(exponent, mantissa));
- }
-
- static SelfPtr float_(int64_t exponent, int64_t mantissa,
- bool finite_values_only, int64_t nan_repr) {
- check_mantissa(mantissa);
- check_exponent(exponent);
- return c10::make_intrusive<Self>(ScalarType::float_(
- exponent, mantissa, finite_values_only, NanRepr(nan_repr)));
- }
-
- // This needs to be implemented and throw a TypeError in order for
- // PyTorch's opcheck to work on ops that use ScalarTypes.
- int64_t len() const {
- throw c10::TypeError({__func__, __FILE__, static_cast<uint32_t>(__LINE__)},
- "__len__ not implemented");
- return 0;
- }
-
- // Serialize a ScalarType into a tuple of pairs. Where each pair
- // is a (fieldname, value).
- // For simplicity, we are just going to convert to a ScalarTypeId.
- std::tuple> obj_flatten() const {
- return {{"ScalarType", id()}};
- }
-
- // Deserialize a scalar type that has been serialized by obj_flatten,
- // ostensibly from a tuple of (member name, value) pairs, but in reality
- // just a ScalarTypeId.
- static SelfPtr obj_unflatten(
- std::tuple> const& flat_type) {
- return c10::make_intrusive<Self>(
- from_id(std::get<1>(std::get<0>(flat_type))));
- }
-
- template <typename T>
- static void bind_readonly_property(torch::class_<Self>& cls,
- std::string const& name, T Base::*field) {
- auto getter_func_helper = [field = std::move(field)](SelfPtr const& self) {
- if constexpr (std::is_member_function_pointer_v<decltype(field)>) {
- return (self.get()->*field)();
- } else {
- return self.get()->*field;
- }
- };
-
- auto getter_func = [field = std::move(field),
- getter_func_helper = std::move(getter_func_helper)](
- SelfPtr const& self) {
- auto val = getter_func_helper(self);
- // upconvert uint8_t, int32_t etc. to int64_t for python
- if constexpr (std::is_integral_v<decltype(val)>) {
- return static_cast<int64_t>(val);
- } else {
- return val;
- }
- };
-
- cls.def_property(name, getter_func);
- }
-
- template <typename MemberFunc, typename Cls>
- static void bind_function(torch::class_<Self>& cls, const std::string& name,
- MemberFunc Cls::*member) {
- cls.def(name, [member = std::move(member)](SelfPtr const& self) {
- return (self.get()->*member)();
- });
- }
-
- template <typename Func>
- static void bind_function(torch::class_<Self>& cls, const std::string& name,
- Func func) {
- cls.def(name, func);
- }
-
- template <typename Func>
- static void bind_static_function(torch::class_<Self>& cls,
- const std::string& name, Func func) {
- cls.def_static(name, func);
- }
-
- static void bind_class(torch::Library& lib) {
- auto cls = lib.class_<ScalarTypeTorch>("ScalarType")
- .def(torch::init<int64_t, int64_t, int64_t, bool>());
-
- // Bind Properties
- bind_readonly_property(cls, "mantissa", &Base::mantissa);
- bind_readonly_property(cls, "exponent", &Base::exponent);
- bind_readonly_property(cls, "bias", &Base::bias);
- bind_readonly_property(cls, "signed", &Base::is_signed);
- bind_readonly_property(cls, "size_bits", &Base::size_bits);
-
- // Bind member functions
- bind_function(cls, "is_signed", &Base::is_signed);
- bind_function(cls, "is_integer", &Base::is_integer);
- bind_function(cls, "is_floating_point", &Base::is_floating_point);
- bind_function(cls, "is_ieee_754", &Base::is_ieee_754);
- bind_function(cls, "has_nans", &Base::has_nans);
- bind_function(cls, "has_infs", &Base::has_infs);
- bind_function(cls, "has_bias", &Base::has_bias);
-
- bind_function(cls, "max", [](SelfPtr const& self) {
- return std::visit([](auto arg) { return c10::IValue(arg); },
- self.get()->max());
- });
- bind_function(cls, "min", [](SelfPtr const& self) {
- return std::visit([](auto arg) { return c10::IValue(arg); },
- self.get()->min());
- });
-
- bind_function(cls, "__len__", &ScalarTypeTorch::len);
- bind_function(cls, "__str__", &Base::str);
- bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) {
- return *self == *other;
- });
- bind_function(cls, "__repr__", [](SelfPtr const& self) {
- return "ScalarType." + self.get()->str();
- });
-
- bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten);
- bind_static_function(cls, "__obj_unflatten__",
- &ScalarTypeTorch::obj_unflatten);
-
- // Bind static functions (convenience constructors)
- bind_static_function(cls, "int_", &ScalarTypeTorch::int_);
- bind_static_function(cls, "uint", &ScalarTypeTorch::uint);
- bind_static_function(cls, "float_IEEE754", &ScalarTypeTorch::float_IEEE754);
- bind_static_function(cls, "float_", &ScalarTypeTorch::float_);
- }
-};
-
-using ScalarTypeId = int64_t;
-using ScalarTypeTorchPtr = c10::intrusive_ptr<ScalarTypeTorch>;
+using ScalarTypeId = ScalarType::Id;
// "rust style" names generally following:
// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
diff --git a/csrc/core/torch_bindings.cpp b/csrc/core/torch_bindings.cpp
deleted file mode 100644
index f60254189a2f7..0000000000000
--- a/csrc/core/torch_bindings.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <torch/library.h>
-
-#include "scalar_type.hpp"
-#include "registration.h"
-
-// Note the CORE exstension will be built for (almost) all hardware targets so
-// new additions must account for this. (currently not built for TPU and Neuron)
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) {
- // ScalarType, a custom class for representing data types that supports
- // quantized types, declared here so it can be used when creating interfaces
- // for custom ops.
- vllm::ScalarTypeTorch::bind_class(lib);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index abb4e3bea14bb..e6c03dcb034fd 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -22,6 +22,24 @@ struct KernelVecType {
using v_load_vec_type = vec_op::FP32Vec16;
};
+template <>
+struct KernelVecType<c10::Half> {
+#ifdef __powerpc64__
+ // Power architecture-specific vector types
+ using q_load_vec_type = vec_op::FP32Vec8;
+ using k_load_vec_type = vec_op::FP32Vec16;
+ using v_load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures, including x86
+ using q_load_vec_type = vec_op::FP16Vec8;
+ using k_load_vec_type = vec_op::FP16Vec16;
+ using v_load_vec_type = vec_op::FP16Vec16;
+#endif
+ using q_vec_type = vec_op::FP32Vec16;
+ using k_vec_type = vec_op::FP32Vec16;
+ using qk_acc_vec_type = vec_op::FP32Vec16;
+};
+
#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
@@ -375,6 +393,9 @@ void paged_attention_v1_impl_launcher(
int* seq_lens_ptr = seq_lens.data_ptr();
switch (head_size) {
+ case 32:
+ LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
+ break;
case 64:
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
break;
@@ -692,6 +713,9 @@ void paged_attention_v2_impl_launcher(
int* seq_lens_ptr = seq_lens.data_ptr();
switch (head_size) {
+ case 32:
+ LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
+ break;
case 64:
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
break;
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 5b1d3d6442b2b..4bb4eb0f491ac 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation.");
namespace vec_op {
-// FIXME: FP16 is not fully supported in Torch-CPU
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
- AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
@@ -50,37 +50,37 @@ template struct Vec {
struct FP32Vec8;
struct FP32Vec16;
-#ifdef __AVX512FP16__
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
- __m128h reg;
+ __m128i reg;
- explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
+ explicit FP16Vec8(const void *ptr)
+ : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
- explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
+ explicit FP16Vec8(const FP32Vec8 &);
- explicit FP16Vec8(__m128h data) : reg(data) {}
+ void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
+};
- FP16Vec8 operator*(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_mul_ph(reg, b.reg));
- }
+struct FP16Vec16 : public Vec<FP16Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
- FP16Vec8 operator+(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_add_ph(reg, b.reg));
- }
+ __m256i reg;
- FP16Vec8 operator-(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_sub_ph(reg, b.reg));
- }
+ explicit FP16Vec16(const void *ptr)
+ : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
- FP16Vec8 operator/(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_div_ph(reg, b.reg));
- }
+ explicit FP16Vec16(const FP32Vec16 &);
+
+ void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
- void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
+ void save(void* ptr, const int elem_num) const {
+ constexpr uint32_t M = 0xFFFFFFFF;
+ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+ _mm256_mask_storeu_epi16(ptr, mask, reg);
+ }
};
-#endif
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
@@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec {
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
-#ifdef __AVX512FP16__
- explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
-#endif
+ explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {}
explicit FP32Vec8(const BF16Vec8 &v)
: reg(_mm256_castsi256_ps(
@@ -265,6 +263,30 @@ struct FP32Vec8 : public Vec {
void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};
+#ifdef __AVX512F__
+struct INT32Vec16 : public Vec<INT32Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+ union AliasReg {
+ __m512i reg;
+ int32_t values[VEC_ELEM_NUM];
+ };
+
+ __m512i reg;
+
+ explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {}
+
+ void save(int32_t* ptr) const {
+ _mm512_storeu_epi32(ptr, reg);
+ }
+
+ void save(int32_t* ptr, const int elem_num) const {
+ constexpr uint32_t M = 0xFFFFFFFF;
+ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+ _mm512_mask_storeu_epi32(ptr, mask, reg);
+ }
+};
+#endif
+
#ifdef __AVX512F__
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
@@ -283,8 +305,6 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(__m512 data) : reg(data) {}
- explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
-
explicit FP32Vec16(const FP32Vec4 &data)
: reg((__m512)_mm512_inserti32x4(
_mm512_inserti32x4(
@@ -301,8 +321,15 @@ struct FP32Vec16 : public Vec {
: reg(_mm512_castsi512_ps(
_mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
+ explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {}
+
+ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+ explicit FP32Vec16(const INT32Vec16 &v)
+ : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {}
+
FP32Vec16 operator*(const FP32Vec16 &b) const {
return FP32Vec16(_mm512_mul_ps(reg, b.reg));
}
@@ -333,6 +360,16 @@ struct FP32Vec16 : public Vec {
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
}
+ FP32Vec16 min(const FP32Vec16& b) const {
+ return FP32Vec16(_mm512_min_ps(reg, b.reg));
+ }
+
+ FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
+ constexpr uint32_t M = 0xFFFFFFFF;
+ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+ return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
+ }
+
FP32Vec16 abs() const {
return FP32Vec16(_mm512_abs_ps(reg));
}
@@ -341,6 +378,8 @@ struct FP32Vec16 : public Vec {
float reduce_max() const { return _mm512_reduce_max_ps(reg); }
+ float reduce_min() const { return _mm512_reduce_min_ps(reg); }
+
template <int group_size> float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
@@ -393,6 +432,16 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(const FP32Vec8 &data)
: reg_low(data.reg), reg_high(data.reg) {}
+ explicit FP32Vec16(const FP16Vec16 &v) {
+ __m128i low = _mm256_extractf128_si256(v.reg, 0);
+ __m128i high = _mm256_extractf128_si256(v.reg, 1);
+
+ reg_low = _mm256_cvtph_ps(low);
+ reg_high = _mm256_cvtph_ps(high);
+ }
+
+ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
explicit FP32Vec16(const BF16Vec16 &v) {
__m128i low = _mm256_extractf128_si256(v.reg, 0);
__m128i high = _mm256_extractf128_si256(v.reg, 1);
@@ -497,24 +546,34 @@ template using vec_t = typename VecType::vec_type;
template <> struct VecType<float> { using vec_type = FP32Vec8; };
-#ifdef __AVX512FP16__
-template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
-#endif
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
-#ifdef __AVX512FP16__
-template <> inline void storeFP32(float v, c10::Half *ptr) {
- *reinterpret_cast<_Float16 *>(ptr) = v;
-}
-#endif
-
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
acc = acc + a * b;
}
+template <> inline void storeFP32(float v, c10::Half *ptr) {
+ *reinterpret_cast<uint16_t*>(ptr) =
+ _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
+inline FP16Vec8::FP16Vec8(const FP32Vec8 &v)
+ : reg(_mm256_cvtps_ph(v.reg,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
+
+#ifdef __AVX512F__
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
+ : reg(_mm512_cvtps_ph(v.reg,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
+#else
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
+ : reg(_mm256_insertf128_si256(
+ _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg),
+ FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {}
+#endif
+
#ifdef __AVX512BF16__
template <> inline void storeFP32(float v, c10::BFloat16 *ptr) {
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
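The FP16Vec8/FP16Vec16 types above store raw FP16 bits in integer registers and convert to and from FP32 with F16C intrinsics instead of relying on AVX512-FP16. A minimal round-trip sketch of that conversion (not part of the patch; assumes a compiler with AVX2 and F16C enabled, e.g. -mavx2 -mf16c):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(32) float in[8] = {0.1f, 1.5f, -2.25f, 3.0f, -4.5f, 65504.0f, 1e-4f, 0.0f};
  alignas(16) uint16_t half_bits[8];
  alignas(32) float out[8];

  __m256 f32 = _mm256_loadu_ps(in);
  // FP16Vec8(const FP32Vec8&): float32 -> float16, round to nearest even.
  __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(half_bits), f16);
  // FP32Vec8(const FP16Vec8&): float16 -> float32.
  __m256 back = _mm256_cvtph_ps(f16);
  _mm256_storeu_ps(out, back);

  for (int i = 0; i < 8; ++i)
    std::printf("%g -> 0x%04x -> %g\n", in[i], half_bits[i], out[i]);
  return 0;
}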
diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp
index 024ad4ae43da8..8b5011dc065f0 100644
--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
@@ -2,6 +2,7 @@
#define DNNL_HELPER_HPP
#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
#include "oneapi/dnnl/dnnl.hpp"
@@ -32,6 +33,11 @@ struct DNNLType {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};
+template <>
+struct DNNLType<c10::Half> {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
+};
+
template <typename T>
constexpr inline dnnl::memory::data_type get_dnnl_type() {
return DNNLType<std::decay_t<T>>::type;
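dnnl_helper.hpp maps C++ element types to oneDNN data-type tags through template specialization, and the new c10::Half specialization plugs FP16 into that mapping. The sketch below illustrates the same trait pattern with a stand-in enum (not the real oneDNN API; all names here are illustrative):

#include <cstdint>
#include <cstdio>
#include <type_traits>

enum class DataTypeTag { undef, f32, bf16, f16, s8 };

// Primary template: unknown types map to an undefined tag.
template <typename T>
struct TypeTag {
  static constexpr DataTypeTag type = DataTypeTag::undef;
};
template <> struct TypeTag<float>  { static constexpr DataTypeTag type = DataTypeTag::f32; };
template <> struct TypeTag<int8_t> { static constexpr DataTypeTag type = DataTypeTag::s8; };
// The patch adds the analogous specialization for c10::Half -> f16.

// std::decay_t so const/reference-qualified types resolve to the same tag,
// mirroring DNNLType<std::decay_t<T>>::type above.
template <typename T>
constexpr DataTypeTag get_tag() {
  return TypeTag<std::decay_t<T>>::type;
}

int main() {
  static_assert(get_tag<const float&>() == DataTypeTag::f32, "decay works");
  std::printf("int8_t tag: %d\n", static_cast<int>(get_tag<int8_t>()));
  return 0;
}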
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
index 2d7abe6145fee..d9aed657a3113 100644
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -5,25 +5,42 @@ namespace {
template <typename scalar_t>
struct KernelVecType {
using load_vec_type = void;
+ using azp_adj_load_vec_type = void;
using cvt_vec_type = void;
};
template <>
struct KernelVecType<float> {
using load_vec_type = vec_op::FP32Vec16;
+ using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using load_vec_type = vec_op::BF16Vec16;
+ using azp_adj_load_vec_type = vec_op::INT32Vec16;
+ using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+template <>
+struct KernelVecType<c10::Half> {
+#ifdef __powerpc64__
+ // Power architecture-specific vector type
+ using load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures
+ using load_vec_type = vec_op::FP16Vec16;
+#endif
+ using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
#ifdef __AVX512F__
-template <typename scalar_t>
+template <bool AZP, typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- const float* scale, const int num_tokens,
+ const float* scale, const int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType::load_vec_type;
using cvt_vec_t = typename KernelVecType::cvt_vec_type;
@@ -37,62 +54,110 @@ void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
const cvt_vec_t i8_min_vec(i8_min);
const cvt_vec_t i8_max_vec(i8_max);
+ cvt_vec_t zp_vec;
+ if constexpr (AZP) {
+ zp_vec = cvt_vec_t(static_cast(*azp));
+ }
+
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
+ elems_fp32 = elems_fp32 * inv_scale;
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + zp_vec;
+ }
+
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_fp32 = elems_fp32 * inv_scale;
- if (j + vec_elem_num == hidden_size) {
- elems_int8.save(output + i * hidden_size + j);
- } else {
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + zp_vec;
}
+
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
-template <typename scalar_t>
+template <bool AZP, typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- float* scale, const int num_tokens,
+ float* scale, int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType::load_vec_type;
using cvt_vec_t = typename KernelVecType::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+ constexpr float i8_min =
+ static_cast(std::numeric_limits::min());
+ constexpr float i8_max =
+ static_cast(std::numeric_limits::max());
+ const cvt_vec_t i8_min_vec(i8_min);
+ const cvt_vec_t i8_max_vec(i8_max);
+
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
- cvt_vec_t max_abs(0.0);
+ cvt_vec_t max_value(std::numeric_limits::lowest());
+ cvt_vec_t min_value(std::numeric_limits::max());
{
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
- max_abs = max_abs.max(elems_fp32.abs());
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32);
+ min_value = min_value.min(elems_fp32);
+ } else {
+ max_value = max_value.max(elems_fp32.abs());
+ }
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
if (j + vec_elem_num == hidden_size) {
- max_abs = max_abs.max(elems_fp32.abs());
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32);
+ min_value = min_value.min(elems_fp32);
+ } else {
+ max_value = max_value.max(elems_fp32.abs());
+ }
} else {
- max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32, hidden_size - j);
+ min_value = min_value.min(elems_fp32, hidden_size - j);
+ } else {
+ max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
+ }
}
}
- float scale_val = max_abs.reduce_max() / 127.0f;
- scale[i] = scale_val;
+ float scale_val, azp_val;
+ if constexpr (AZP) {
+ float max_scalar = max_value.reduce_max();
+ float min_scalar = min_value.reduce_min();
+ scale_val = (max_scalar - min_scalar) / 255.0f;
+ azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
+ azp[i] = static_cast(azp_val);
+ scale[i] = scale_val;
+ } else {
+ scale_val = max_value.reduce_max() / 127.0f;
+ scale[i] = scale_val;
+ }
+
const cvt_vec_t inv_scale(1.0 / scale_val);
+ const cvt_vec_t azp_vec(azp_val);
{
int j = 0;
@@ -100,6 +165,11 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + azp_vec;
+ }
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
@@ -107,34 +177,111 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- if (j + vec_elem_num == hidden_size) {
- elems_int8.save(output + i * hidden_size + j);
- } else {
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + azp_vec;
}
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
-template <bool Bias, typename scalar_t>
-void dynamic_output_scale_impl(const float* input, scalar_t* output,
- const float* scale, const scalar_t* bias,
- const int num_tokens, const int hidden_size) {
+template <bool PerChannel, typename scalar_t>
+void static_quant_epilogue(const float* input, scalar_t* output,
+ const float a_scale, const float* b_scale,
+ const int32_t* azp_with_adj, const int num_tokens,
+ const int hidden_size) {
CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
using load_vec_t = typename KernelVecType::load_vec_type;
+ using azp_adj_load_vec_t =
+ typename KernelVecType::azp_adj_load_vec_type;
using cvt_vec_t = typename KernelVecType::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
+ cvt_vec_t a_scale_vec(a_scale);
+ cvt_vec_t b_scale_vec(*b_scale);
+ cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
+
int j = 0;
- cvt_vec_t token_scale_vec(scale[i]);
+ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+ cvt_vec_t elems_fp32(input + i * hidden_size + j);
+ azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+ if constexpr (PerChannel) {
+ b_scale_vec = cvt_vec_t(b_scale + j);
+ scale_vec = b_scale_vec * a_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output + i * hidden_size + j);
+ }
+
+ cvt_vec_t elems_fp32(input + i * hidden_size + j);
+ azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+ if constexpr (PerChannel) {
+ b_scale_vec = cvt_vec_t(b_scale + j);
+ scale_vec = b_scale_vec * a_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output + i * hidden_size + j, hidden_size - j);
+ }
+}
+
+template <bool AZP, bool PerChannel, bool Bias, typename scalar_t>
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+ const float* a_scale, const float* b_scale,
+ const int32_t* azp, const int32_t* azp_adj,
+ const scalar_t* bias, const int num_tokens,
+ const int hidden_size) {
+ CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
+ using load_vec_t = typename KernelVecType::load_vec_type;
+ using azp_adj_load_vec_t =
+ typename KernelVecType::azp_adj_load_vec_type;
+ using cvt_vec_t = typename KernelVecType::cvt_vec_type;
+ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+ #pragma omp parallel for
+ for (int i = 0; i < num_tokens; ++i) {
+ int j = 0;
+ cvt_vec_t token_scale_vec(a_scale[i]);
+ cvt_vec_t token_zp_scale_vec;
+ if constexpr (AZP) {
+ float zp_scale_val = a_scale[i] * static_cast(azp[i]);
+ if constexpr (!PerChannel) {
+ zp_scale_val *= *b_scale;
+ }
+ token_zp_scale_vec = cvt_vec_t(zp_scale_val);
+ }
+
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+ azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+ if constexpr (PerChannel) {
+ cvt_vec_t b_scale_vec(b_scale + j);
+ azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - azp_adj_fp32;
+ }
+
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
@@ -148,6 +295,19 @@ void dynamic_output_scale_impl(const float* input, scalar_t* output,
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+ azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+ if constexpr (PerChannel) {
+ cvt_vec_t b_scale_vec(b_scale + j);
+ azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - azp_adj_fp32;
+ }
+
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
@@ -155,32 +315,41 @@ void dynamic_output_scale_impl(const float* input, scalar_t* output,
}
load_vec_t elems_out(elems_fp32);
-
- if (j + vec_elem_num == hidden_size) {
- elems_out.save(output + i * hidden_size + j);
- } else {
- elems_out.save(output + i * hidden_size + j, hidden_size - j);
- }
+ elems_out.save(output + i * hidden_size + j, hidden_size - j);
}
}
#else
template
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- const float* scale, const int num_tokens,
+ const float* scale, const int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
}
template
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- float* scale, const int num_tokens,
+ float* scale, int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
}
+template
+void static_quant_epilogue(const float* input, scalar_t* output,
+ const float a_scale, const float* b_scale,
+ const int32_t* azp_with_adj, const int num_tokens,
+ const int hidden_size) {
+ TORCH_CHECK(false, "static_quant_epilogue requires AVX512 support.")
+}
+
template
-void dynamic_output_scale_impl() {
- TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+ const float* a_scale, const float* b_scale,
+ const int32_t* azp, const int32_t* azp_with_adj,
+ const scalar_t* bias, const int num_tokens,
+ const int hidden_size) {
+ TORCH_CHECK(false, "dynamic_quant_epilogue requires AVX512 support.")
}
#endif
} // namespace
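For the asymmetric (AZP) path above, each token is quantized with scale = (max - min) / 255 and a zero point azp = round(-128 - min / scale), so the observed minimum maps to -128 and the maximum to 127. A scalar reference of that math (not part of the patch; the sample values are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> token = {-0.5f, 0.0f, 0.25f, 1.5f};
  const float mx = *std::max_element(token.begin(), token.end());
  const float mn = *std::min_element(token.begin(), token.end());

  // Same formulas as dynamic_scaled_int8_quant_impl with AZP enabled.
  const float scale = (mx - mn) / 255.0f;
  const int32_t azp =
      static_cast<int32_t>(std::nearbyint(-128.0f - mn / scale));

  for (float x : token) {
    const float q = std::nearbyint(x / scale) + static_cast<float>(azp);
    const int8_t v = static_cast<int8_t>(std::clamp(q, -128.0f, 127.0f));
    std::printf("%+.2f -> %d\n", x, v);  // min -> -128, max -> 127
  }
  std::printf("scale=%g azp=%d\n", scale, azp);
  return 0;
}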
@@ -214,39 +383,52 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
bias->dim() == 1);
}
- VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
+ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] {
if (a_scales.numel() != 1) {
// per-token
// Note: oneDNN doesn't support per-token activation quantization
+ // Ideally we would fuse the GEMM and the scaling step with the oneDNN
+ // JIT so the intermediate data stays in registers or L1, but for now
+ // oneDNN GEMM code generation only supports two quantization
+ // patterns: per-tensor, or per-output-channel of the weight.
+ // So we have to apply the per-token scale with an epilogue. In
+ // C = s_a * s_b * (A@B) + bias, the intermediate C_inter = s_b * (A@B)
+ // is computed by the oneDNN GEMM, and the per-token scale (and bias)
+ // is then applied with the epilogue C = s_a * C_inter + bias.
torch::Tensor tmp_fp32_out =
torch::empty_like(c, ::at::ScalarType::Float);
- DNNLPrimitiveHelper::gemm_s8s8_jit(
+ // Compute C_inter=s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
a.data_ptr(), b.data_ptr(),
- tmp_fp32_out.data_ptr(), (void*)(0), a.size(0), b.size(1),
- a.size(1), (float*)(0), b_scales.data_ptr(), 0,
- b_scales.numel());
+ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1),
+ a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel());
if (bias.has_value()) {
- dynamic_output_scale_impl(
+ // Compute C=s_a * C_inter + bias
+ dynamic_quant_epilogue(
tmp_fp32_out.data_ptr(), c.data_ptr(),
- a_scales.data_ptr(), bias->data_ptr(), c.size(0),
- c.size(1));
+ a_scales.data_ptr(), nullptr, nullptr, nullptr,
+ bias->data_ptr(), c.size(0), c.size(1));
} else {
- dynamic_output_scale_impl(
+ // Compute C=s_a * C_inter
+ dynamic_quant_epilogue(
tmp_fp32_out.data_ptr(), c.data_ptr(),
- a_scales.data_ptr(), (scalar_t*)(0), c.size(0), c.size(1));
+ a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr,
+ c.size(0), c.size(1));
}
} else {
// per-tensor
if (bias.has_value()) {
+ // Compute C=s_a * s_b * (A@B) + bias
DNNLPrimitiveHelper::gemm_s8s8_jit(
a.data_ptr(), b.data_ptr(), c.data_ptr(),
bias->data_ptr(), a.size(0), b.size(1), a.size(1),
a_scales.data_ptr(), b_scales.data_ptr(),
a_scales.numel(), b_scales.numel());
} else {
- DNNLPrimitiveHelper::gemm_s8s8_jit(
+ // Compute C=s_a * s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
a.data_ptr(), b.data_ptr(), c.data_ptr(),
- (void*)(0), a.size(0), b.size(1), a.size(1),
+ nullptr, a.size(0), b.size(1), a.size(1),
a_scales.data_ptr(), b_scales.data_ptr(),
a_scales.numel(), b_scales.numel());
}
@@ -254,6 +436,127 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
});
}
+void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
+ const torch::Tensor& a, // [M, IC], row-major
+ const torch::Tensor& b, // [IC, OC], column-major
+ const torch::Tensor& a_scales, // [1] or [M]
+ const torch::Tensor& b_scales, // [1] or [OC]
+ const torch::Tensor& azp_adj, // [OC]
+ const c10::optional<torch::Tensor>& azp, // [1] or [M]
+ const c10::optional<torch::Tensor>& bias // [OC]
+) {
+ CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
+ // Checks for conformality
+ TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+ "int8_scaled_mm_azp only supports INT8 inputs.")
+ TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+ TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+ b.size(1) == c.size(1));
+ TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+ TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+ // Check for strides and alignment
+ TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major
+ TORCH_CHECK(b.stride(0) == 1); // Column-major
+ TORCH_CHECK(c.stride(0) % 16 == 0 &&
+ b.stride(1) % 16 == 0); // 16 Byte Alignment
+ TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+ if (bias) {
+ TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous());
+ }
+ if (azp) {
+ TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous());
+ }
+ TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous());
+
+ // azp & bias types
+ TORCH_CHECK(azp_adj.dtype() == torch::kInt32);
+ TORCH_CHECK(!azp || azp->dtype() == torch::kInt32);
+ TORCH_CHECK(!bias || bias->dtype() == c.dtype(),
+ "currently bias dtype must match output dtype ", c.dtype());
+
+ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] {
+ torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
+ if (a_scales.numel() != 1) {
+ // per-token
+ // Note: oneDNN doesn't support per-token activation quantization
+ // Compute C_inter=s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
+ a.data_ptr(), b.data_ptr(),
+ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1),
+ a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel());
+ if (bias.has_value()) {
+ // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias
+ if (b_scales.numel() != 1) {
+ // Per-Channel
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(),
+ bias->data_ptr(), c.size(0), c.size(1));
+ } else {
+ // Per-Tensor
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(),
+ bias->data_ptr(), c.size(0), c.size(1));
+ }
+ } else {
+ // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj
+ if (b_scales.numel() != 1) {
+ // Per-Channel
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(), nullptr,
+ c.size(0), c.size(1));
+ } else {
+ // Per-Tensor
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(), nullptr,
+ c.size(0), c.size(1));
+ }
+ }
+ } else {
+ // per-tensor
+ if (bias.has_value()) {
+ // Compute C_inter=s_a * s_b * (A@B) + bias
+ DNNLPrimitiveHelper