From 2a4cd36ad65ea271da523d3ee80652883c83484a Mon Sep 17 00:00:00 2001 From: seungrokjung Date: Sun, 6 Apr 2025 15:05:31 +0000 Subject: [PATCH] vllm rocm 6.3.1 deepseek r1, v3 Signed-off-by: seungrokjung --- benchmark/vllm/README.md | 10 +++----- models.json | 36 +++++++++++++++++++++++++++ scripts/vllm/config.csv | 10 ++++++++ scripts/vllm/vllm_benchmark_report.sh | 6 ++++- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md index 4bb91b8..864334e 100644 --- a/benchmark/vllm/README.md +++ b/benchmark/vllm/README.md @@ -14,7 +14,7 @@ This Docker image packages vLLM with PyTorch for an AMD Instinct™ MI300X accelerator. It includes: - ✅ ROCm™ 6.3.1 -- ✅ vLLM 0.7.3 +- ✅ vLLM 0.7.4 - ✅ PyTorch 2.7.0 (dev nightly) - ✅ hipBLASLt 0.13 @@ -184,6 +184,7 @@ cd MAD/scripts/vllm | | amd/Mistral-7B-v0.1-FP8-KV | [Mistral 7B](https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV) | | | amd/dbrx-instruct-FP8-KV | [DBRX Instruct](https://huggingface.co/amd/dbrx-instruct-FP8-KV) | | | amd/c4ai-command-r-plus-FP8-KV | [C4AI Command R+ 08-2024](https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV) | +| | deepseek-ai/DeepSeek-R1 | [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | | $num_gpu | 1 or 8 | Number of GPUs | | $datatype | float16, float8 | Data type | @@ -303,13 +304,10 @@ owners and are only mentioned for informative purposes.    ## Changelog ---------- This release note summarizes notable changes since the previous docker release. - -- The vLLM version number was incremented from 0.6.6 to 0.7.3. - -- Improved fp8 throughput performance with HipblasLT 0.13 +- The vLLM version number was incremented from 0.7.3 to 0.7.4. 
- The float8 data type benchmark test was added to include the following models: -Llama 3.1 8B Instruct +DeepSeek-R1, DeepSeek-V3 ## Support ---------- diff --git a/models.json b/models.json index 0d0ed06..c59b227 100644 --- a/models.json +++ b/models.json @@ -478,6 +478,42 @@ "args": "--model_repo amd/c4ai-command-r-plus-FP8-KV --test_option latency,throughput --num_gpu 8 --datatype float8 --tunableop off" }, + { + "name": "pyt_vllm_deepseek-v3_fp8", + "url": "", + "dockerfile": "docker/pyt_vllm", + "scripts": "scripts/vllm/run.sh", + "data": "huggingface", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "", + "multiple_results": "perf_DeepSeek-V3.csv", + "tags": [ + "pyt", + "vllm" + ], + "timeout": -1, + "args": + "--model_repo deepseek-ai/DeepSeek-V3 --test_option latency,throughput --num_gpu 8 --datatype float8 --tunableop off" + }, + { + "name": "pyt_vllm_deepseek-r1_fp8", + "url": "", + "dockerfile": "docker/pyt_vllm", + "scripts": "scripts/vllm/run.sh", + "data": "huggingface", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "", + "multiple_results": "perf_DeepSeek-R1.csv", + "tags": [ + "pyt", + "vllm" + ], + "timeout": -1, + "args": + "--model_repo deepseek-ai/DeepSeek-R1 --test_option latency,throughput --num_gpu 8 --datatype float8 --tunableop off" + }, { "name": "pyt_train_llama-3.1-8b", "url": "", diff --git a/scripts/vllm/config.csv b/scripts/vllm/config.csv index 6e9dbc8..b54705d 100644 --- a/scripts/vllm/config.csv +++ b/scripts/vllm/config.csv @@ -138,3 +138,13 @@ deepseek-ai/deepseek-moe-16b-chat,128,128,4000,4000,NA,NA,NA,0.9,10,FALSE deepseek-ai/deepseek-moe-16b-chat,128,2048,3000,3000,NA,NA,NA,0.9,10,FALSE deepseek-ai/deepseek-moe-16b-chat,2048,128,3000,3000,NA,NA,NA,0.9,10,FALSE deepseek-ai/deepseek-moe-16b-chat,2048,2048,1500,1500,NA,NA,NA,0.9,10,FALSE + +deepseek-ai/DeepSeek-R1,128,128,4000,4000,32768,32768,32768,0.9,1,FALSE 
+deepseek-ai/DeepSeek-R1,128,2048,1500,1500,32768,32768,32768,0.9,1,FALSE +deepseek-ai/DeepSeek-R1,2048,128,500,1500,32768,32768,32768,0.9,1,FALSE +deepseek-ai/DeepSeek-R1,2048,2048,500,1000,32768,32768,32768,0.9,1,FALSE + +deepseek-ai/DeepSeek-V3,128,128,4000,4000,32768,32768,32768,0.9,1,FALSE +deepseek-ai/DeepSeek-V3,128,2048,1500,1500,32768,32768,32768,0.9,1,FALSE +deepseek-ai/DeepSeek-V3,2048,128,500,1500,32768,32768,32768,0.9,1,FALSE +deepseek-ai/DeepSeek-V3,2048,2048,500,1000,32768,32768,32768,0.9,1,FALSE diff --git a/scripts/vllm/vllm_benchmark_report.sh b/scripts/vllm/vllm_benchmark_report.sh index c3b8b21..686f2b4 100755 --- a/scripts/vllm/vllm_benchmark_report.sh +++ b/scripts/vllm/vllm_benchmark_report.sh @@ -64,7 +64,11 @@ fi if [[ $datatype == "float16" ]]; then DTYPE=" --dtype float16 " elif [[ $datatype == "float8" ]]; then - DTYPE=" --dtype float16 --quantization fp8 --kv-cache-dtype fp8 " + if [[ $model_name == "DeepSeek-R1" ]] || [[ $model_name == "DeepSeek-V3" ]] ; then + DTYPE=" --dtype float16 --quantization fp8 --max-model-len 32768 " + else + DTYPE=" --dtype float16 --quantization fp8 --kv-cache-dtype fp8 " + fi fi OPTION_LATENCY=" --gpu-memory-utilization 0.9 "