Refactor fastapi-serving and add one card serving (#11581)
* init fastapi-serving one card

* mv api code to source

* update worker

* update for style-check

* add worker

* update bash

* update

* update worker name and add readme

* rename update

* rename to fastapi
hzjane authored Jul 17, 2024
1 parent 373ccbb commit 9c15abf
Showing 19 changed files with 583 additions and 367 deletions.
2 changes: 1 addition & 1 deletion docker/llm/inference/xpu/docker/Dockerfile
@@ -61,7 +61,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
# Download pp_serving
mkdir -p /llm/pp_serving && \
-cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-FastAPI/*.py /llm/pp_serving/ && \
+cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
# Install related library of benchmarking
pip install pandas omegaconf && \
chmod +x /llm/benchmark.sh && \
346 changes: 0 additions & 346 deletions python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py

This file was deleted.

@@ -50,7 +50,14 @@ pip install transformers==4.40.0
pip install trl==0.8.1
```
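The version pins above are easy to drift away from in a shared environment. As a hedged illustration (not part of this commit), a small stdlib-only sketch can report installed versus expected versions without crashing when a package is absent:

```python
from importlib import metadata

# Pins taken from the README instructions above; any other package names
# you add here are your own assumptions about the environment.
PINS = {"transformers": "4.40.0", "trl": "0.8.1"}

def check_pins(pins):
    """Return {name: (installed_version_or_None, expected_version)}."""
    report = {}
    for name, wanted in pins.items():
        try:
            report[name] = (metadata.version(name), wanted)
        except metadata.PackageNotFoundError:
            report[name] = (None, wanted)  # not installed in this env
    return report

for name, (got, wanted) in check_pins(PINS).items():
    print(f"{name}: installed={got} expected={wanted}")
```

This only reads package metadata; it does not import the libraries themselves, so it is safe to run before a heavyweight environment is fully set up.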

-### 2. Run pipeline parallel serving on multiple GPUs
+### 2-1. Run ipex-llm serving on one GPU card

```bash
# Need to set NUM_GPUS=1 and MODEL_PATH in run.sh first
bash run.sh
```

### 2-2. Run pipeline parallel serving on multiple GPUs

```bash
# Need to set MODEL_PATH in run.sh first
@@ -76,7 +83,7 @@ export http_proxy=
export https_proxy=

curl -X 'POST' \
-  'http://127.0.0.1:8000/generate/' \
+  'http://127.0.0.1:8000/generate' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
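The JSON body of the curl call is cut off by the diff viewer, so the exact request schema is not visible here. As a hedged sketch, this stdlib-only Python snippet builds (without sending) an equivalent POST against the renamed `/generate` endpoint; the payload field names `prompt` and `n_predict` are illustrative assumptions, not the schema confirmed by this commit:

```python
import json
from urllib import request

# Hypothetical payload: the real body is truncated in the diff above,
# so these field names are assumptions, not the example's actual schema.
PAYLOAD = {"prompt": "What is AI?", "n_predict": 32}

def build_generate_request(base_url="http://127.0.0.1:8000"):
    """Build (without sending) a POST mirroring the curl call above."""
    return request.Request(
        f"{base_url}/generate",  # no trailing slash after this commit
        data=json.dumps(PAYLOAD).encode("utf-8"),
        headers={"accept": "application/json", "Content-Type": "application/json"},
        method="POST",
    )

req = build_generate_request()
print(req.full_url, req.get_method())  # → http://127.0.0.1:8000/generate POST
```

Once the server started by `run.sh` is up, `request.urlopen(req)` would send it.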
@@ -99,7 +106,7 @@ Please change the test url accordingly.

```bash
# set t/c to the number of concurrencies to test full throughput.
-wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate/ --timeout 1m
+wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate --timeout 1m
```
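What the wrk sweep above measures is closed-loop throughput (requests per second) at a fixed concurrency. As a hedged, self-contained sketch of that idea, the snippet below uses a sleeping stand-in instead of a real `/generate` call; `send_request` is hypothetical and would be replaced by an actual HTTP round trip:

```python
import time
from concurrent.futures import ThreadPoolExecutor

def send_request():
    """Stand-in for one /generate round trip; replace with a real HTTP call."""
    time.sleep(0.01)
    return 1

def measure_throughput(concurrency, total_requests=50):
    """Closed-loop requests/second at fixed concurrency, like wrk's -t/-c."""
    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        done = sum(pool.map(lambda _: send_request(), range(total_requests)))
    return done / (time.perf_counter() - start)

for c in (1, 2, 4):
    print(f"concurrency={c}: {measure_throughput(c):.0f} req/s")
```

As with wrk, throughput rises with concurrency until the server (here, the sleep) saturates; sweeping `c` upward is how "full throughput" is found.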

## 5. Using the `benchmark.py` Script