
Adding a model: DeepSeek R1 #31

Open · wants to merge 1 commit into base: develop

10 changes: 4 additions & 6 deletions benchmark/vllm/README.md
@@ -14,7 +14,7 @@
 This Docker image packages vLLM with PyTorch for an AMD Instinct™ MI300X
 accelerator. It includes:
 
 - ✅ ROCm™ 6.3.1
-- ✅ vLLM 0.7.3
+- ✅ vLLM 0.7.4
 - ✅ PyTorch 2.7.0 (dev nightly)
 - ✅ hipBLASLt 0.13
@@ -184,6 +184,7 @@ cd MAD/scripts/vllm
 | | amd/Mistral-7B-v0.1-FP8-KV | [Mistral 7B](https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV) |
 | | amd/dbrx-instruct-FP8-KV | [DBRX Instruct](https://huggingface.co/amd/dbrx-instruct-FP8-KV) |
 | | amd/c4ai-command-r-plus-FP8-KV | [C4AI Command R+ 08-2024](https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV) |
+| | deepseek-ai/DeepSeek-R1 | [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
 | $num_gpu | 1 or 8 | Number of GPUs |
 | $datatype | float16, float8 | Data type |

@@ -303,13 +304,10 @@ owners and are only mentioned for informative purposes.
 ## Changelog
 ----------
 This release note summarizes notable changes since the previous docker release.
 
-- The vLLM version number was incremented from 0.6.6 to 0.7.3.
-
-- Improved fp8 throughput performance with HipblasLT 0.13
+- The vLLM version number was incremented from 0.7.3 to 0.7.4.
 
 - The float8 data type benchmark test was added to include the following models:
   Llama 3.1 8B Instruct
+  DeepSeek-R1
 
 ## Support
 ----------
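For reference, the new benchmark can be launched the same way as the other vLLM models in MAD. This is a sketch only: it assumes the `scripts/vllm/run.sh` wrapper accepts the argument string recorded in the `models.json` entry added below.

```bash
# Hypothetical invocation; the flags mirror the args string in the
# pyt_vllm_deepseek-r1_fp8 entry of models.json.
cd MAD/scripts/vllm
./run.sh --model_repo deepseek-ai/DeepSeek-R1 \
         --test_option latency,throughput \
         --num_gpu 8 \
         --datatype float8 \
         --tunableop off
```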
36 changes: 36 additions & 0 deletions models.json
@@ -478,6 +478,42 @@
         "args":
             "--model_repo amd/c4ai-command-r-plus-FP8-KV --test_option latency,throughput --num_gpu 8 --datatype float8 --tunableop off"
     },
+    {
+        "name": "pyt_vllm_command-r-plus_fp8",
+        "url": "",
+        "dockerfile": "docker/pyt_vllm",
+        "scripts": "scripts/vllm/run.sh",
+        "data": "huggingface",
+        "n_gpus": "-1",
+        "owner": "[email protected]",
+        "training_precision": "",
+        "multiple_results": "perf_c4ai-command-r-plus-FP8-KV.csv",
+        "tags": [
+            "pyt",
+            "vllm"
+        ],
+        "timeout": -1,
+        "args":
+            "--model_repo amd/c4ai-command-r-plus-FP8-KV --test_option latency,throughput --num_gpu 8 --datatype float8 --tunableop off"
+    },
+    {
+        "name": "pyt_vllm_deepseek-r1_fp8",
+        "url": "",
+        "dockerfile": "docker/pyt_vllm",
+        "scripts": "scripts/vllm/run.sh",
+        "data": "huggingface",
+        "n_gpus": "-1",
+        "owner": "[email protected]",
+        "training_precision": "",
+        "multiple_results": "perf_DeepSeek-R1.csv",
+        "tags": [
+            "pyt",
+            "vllm"
+        ],
+        "timeout": -1,
+        "args":
+            "--model_repo deepseek-ai/DeepSeek-R1 --test_option latency,throughput --num_gpu 8 --datatype float8 --tunableop off"
+    },
     {
         "name": "pyt_train_llama-3.1-8b",
         "url": "",
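A quick way to sanity-check the new entries — for example, that the `pyt_vllm_command-r-plus_fp8` block added here does not collide with an existing entry of the same name — is a `jq` pass. This sketch assumes `models.json` is a flat JSON array of model objects:

```bash
# List any duplicated "name" fields; assumes models.json is a flat array.
jq -r '.[].name' models.json | sort | uniq -d

# Show the new DeepSeek-R1 entry.
jq '.[] | select(.name == "pyt_vllm_deepseek-r1_fp8")' models.json
```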
10 changes: 10 additions & 0 deletions scripts/vllm/config.csv
@@ -138,3 +138,13 @@ deepseek-ai/deepseek-moe-16b-chat,128,128,4000,4000,NA,NA,NA,0.9,10,FALSE
 deepseek-ai/deepseek-moe-16b-chat,128,2048,3000,3000,NA,NA,NA,0.9,10,FALSE
 deepseek-ai/deepseek-moe-16b-chat,2048,128,3000,3000,NA,NA,NA,0.9,10,FALSE
 deepseek-ai/deepseek-moe-16b-chat,2048,2048,1500,1500,NA,NA,NA,0.9,10,FALSE
+
+deepseek-ai/DeepSeek-R1,128,128,4000,4000,32768,32768,32768,0.9,1,FALSE
+deepseek-ai/DeepSeek-R1,128,2048,1500,1500,32768,32768,32768,0.9,1,FALSE
+deepseek-ai/DeepSeek-R1,2048,128,500,1500,32768,32768,32768,0.9,1,FALSE
+deepseek-ai/DeepSeek-R1,2048,2048,500,1000,32768,32768,32768,0.9,1,FALSE
+
+deepseek-ai/DeepSeek-V3,128,128,4000,4000,32768,32768,32768,0.9,1,FALSE
+deepseek-ai/DeepSeek-V3,128,2048,1500,1500,32768,32768,32768,0.9,1,FALSE
+deepseek-ai/DeepSeek-V3,2048,128,500,1500,32768,32768,32768,0.9,1,FALSE
+deepseek-ai/DeepSeek-V3,2048,2048,500,1000,32768,32768,32768,0.9,1,FALSE
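The column semantics are not spelled out in the diff. A small sketch for pulling the new rows, with hedged guesses at the fields: the first three columns look like model, input length, and output length, and the 0.9 matches the `--gpu-memory-utilization 0.9` set in the report script — the remaining columns are assumptions, not documented here.

```bash
# Print the new DeepSeek-R1 rows; field meanings are guesses:
# $1=model, $2=input_len, $3=output_len (other columns unverified).
awk -F',' '$1 == "deepseek-ai/DeepSeek-R1" { print $1, $2, $3 }' scripts/vllm/config.csv
```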
6 changes: 5 additions & 1 deletion scripts/vllm/vllm_benchmark_report.sh
@@ -64,7 +64,11 @@ fi
 if [[ $datatype == "float16" ]]; then
     DTYPE=" --dtype float16 "
 elif [[ $datatype == "float8" ]]; then
-    DTYPE=" --dtype float16 --quantization fp8 --kv-cache-dtype fp8 "
+    if [[ $model_name == "DeepSeek-R1" ]] || [[ $model_name == "DeepSeek-V3" ]] ; then
+        DTYPE=" --dtype float16 --quantization fp8 --max-model-len 32768 "
+    else
+        DTYPE=" --dtype float16 --quantization fp8 --kv-cache-dtype fp8 "
+    fi
 fi
 
 OPTION_LATENCY=" --gpu-memory-utilization 0.9 "
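For DeepSeek-R1 and DeepSeek-V3, the float8 branch now drops the fp8 KV-cache flag and pins `--max-model-len 32768`. Downstream, the engine flags passed to vLLM's benchmark scripts would look roughly like the sketch below — illustrative only; the script path and the latency-test parameters are assumptions, not taken from this PR.

```bash
# Hypothetical composed command for a DeepSeek-R1 float8 latency run:
# $DTYPE (DeepSeek branch) plus OPTION_LATENCY as set in vllm_benchmark_report.sh.
python3 benchmarks/benchmark_latency.py \
    --model deepseek-ai/DeepSeek-R1 \
    --dtype float16 --quantization fp8 --max-model-len 32768 \
    --gpu-memory-utilization 0.9 \
    --input-len 128 --output-len 128 --batch-size 1
```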