Commit 071c53e

Improve the quick_start.md
1 parent 7a82557 commit 071c53e


docs/source/getting-started/quick_start.md

Lines changed: 8 additions & 2 deletions
@@ -40,6 +40,8 @@ You can use our official offline example script to run offline inference as follows:

```bash
cd examples/
+# Change the model path to your own model path
+export MODEL_PATH=/home/models/Qwen2.5-14B-Instruct
python offline_inference.py
```
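For context, a minimal sketch of what an offline inference script driven by `MODEL_PATH` could look like. This is illustrative only: the actual contents of `examples/offline_inference.py` are not shown in this diff, and the prompt text below is made up.

```python
# Illustrative sketch only; the real examples/offline_inference.py may differ.
import os

from vllm import LLM, SamplingParams

# Read the model path exported in the step above (fallback matches the docs example)
model_path = os.environ.get("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")

llm = LLM(model=model_path)
sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

# Generate a completion for a sample prompt and print the text
outputs = llm.generate(["Shanghai is a"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```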

@@ -58,7 +60,11 @@ export PYTHONHASHSEED=123456
Run the following command to start the vLLM server with the Qwen/Qwen2.5-14B-Instruct model:

```bash
-vllm serve /home/models/Qwen2.5-14B-Instruct \
+# Change the model path to your own model path
+export MODEL_PATH=/home/models/Qwen2.5-14B-Instruct
+vllm serve ${MODEL_PATH} \
+--trust-remote-code \
+--served-model-name vllm_cpu_offload \
--max-model-len 20000 \
--tensor-parallel-size 2 \
--gpu_memory_utilization 0.87 \
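Once the server is started, one way to confirm that the model is registered under the new `--served-model-name` is to query the OpenAI-compatible `/v1/models` endpoint. A small sketch using only the standard library, assuming the server from the command above is listening on localhost:7800:

```python
# Quick check that the vLLM server exposes the model under its served name
# (assumes the server started above is listening on port 7800)
import json
import urllib.request

with urllib.request.urlopen("http://localhost:7800/v1/models") as resp:
    models = json.load(resp)

# Expect "vllm_cpu_offload" to appear among the model ids
print([m["id"] for m in models["data"]])
```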
@@ -95,7 +101,7 @@ After successfully starting the vLLM server, you can interact with the API as follows:
curl http://localhost:7800/v1/completions \
-H "Content-Type: application/json" \
-d '{
-"model": "/home/models/Qwen2.5-14B-Instruct",
+"model": "vllm_cpu_offload",
"prompt": "Shanghai is a",
"max_tokens": 7,
"temperature": 0
