Skip to content

Commit

Permalink
feat: switch to using vllm (#103)
Browse files Browse the repository at this point in the history
* When CUDA is not available, llama.cpp is still used as a fallback

Co-authored-by: Avram Tudor <[email protected]>
  • Loading branch information
quitrk and Avram Tudor authored Oct 4, 2024
1 parent 046c62a commit 94397c2
Show file tree
Hide file tree
Showing 11 changed files with 2,315 additions and 525 deletions.
19 changes: 2 additions & 17 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,8 @@ COPY docker/rootfs/ /
RUN \
apt-dpkg-wrap apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 && \
apt-dpkg-wrap apt-get update && \
apt-dpkg-wrap apt-get install -y wget build-essential libcurl4-openssl-dev python3.11 python3.11-venv

RUN \
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.sh && \
sh cmake.sh --skip-license --prefix=/usr/local && \
rm cmake.sh

COPY llama.cpp llama.cpp
RUN \
cd llama.cpp && \
rm -rf build && \
cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGGML_CUDA=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --target llama-server -j`getconf _NPROCESSORS_ONLN` && \
ldd build/bin/llama-server
apt-dpkg-wrap apt-get install -y build-essential libcurl4-openssl-dev python3.11 python3.11-venv && \
apt-cleanup

COPY requirements.txt /app/

Expand Down Expand Up @@ -66,9 +54,6 @@ RUN \

# Copy virtual environment
COPY --chown=jitsi:jitsi --from=builder /app/.venv /app/.venv
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin /app/llama.cpp

RUN ldd /app/llama.cpp/llama-server

# Copy application files
COPY --chown=jitsi:jitsi /skynet /app/skynet/
Expand Down
2,626 changes: 2,177 additions & 449 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@ pytest-mock = "3.12.0"
aiohttp = "3.9.5"
async-lru = "2.0.4"
boto3 = "^1.28.56"
fastapi = "0.109"
fastapi = "0.115.0"
fastapi-versionizer = "3.0.4"
faster-whisper = "1.0.3"
prometheus-client = "0.19.0"
prometheus-fastapi-instrumentator = "6.1.0"
prometheus-client = "0.20.0"
prometheus-fastapi-instrumentator = "7.0.0"
pyjwt = {extras = ["crypto"], version = "^2.8.0"}
python = "~3.11"
redis = "5.0.1"
torch = ">=2.0.0,<2.1.0"
torchaudio = ">=2.0.1,<2.1.0"
torch = "2.4.0"
torchaudio = "2.4.0"
uvicorn = {extras = ["standard"], version = "0.29.0"}
uuid6 = "^2024.1.12"
pyyaml = "^6.0.1"
Expand All @@ -41,6 +41,7 @@ langchain = "^0.3.0"
langchain-openai = "^0.2.0"
av = "^12.3.0"
pybase64 = "^1.4.0"
vllm = "^0.6.2"

[build-system]
build-backend = "poetry.core.masonry.api"
Expand Down
95 changes: 76 additions & 19 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,90 +2,147 @@ aiofiles==23.2.1 ; python_version >= "3.11" and python_version < "3.12"
aiohttp==3.9.5 ; python_version >= "3.11" and python_version < "3.12"
aiosignal==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
annotated-types==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.5.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.6.0 ; python_version >= "3.11" and python_version < "3.12"
async-lru==2.0.4 ; python_version >= "3.11" and python_version < "3.12"
async-timeout==4.0.3 ; python_version >= "3.11" and python_full_version <= "3.11.2"
attrs==24.2.0 ; python_version >= "3.11" and python_version < "3.12"
av==12.3.0 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.35.23 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.35.23 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.35.27 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.35.27 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.8.30 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.17.1 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
cffi==1.17.1 ; python_version >= "3.11" and python_version < "3.12" and (platform_python_implementation != "PyPy" or implementation_name == "pypy")
charset-normalizer==3.3.2 ; python_version >= "3.11" and python_version < "3.12"
click==8.1.7 ; python_version >= "3.11" and python_version < "3.12"
cloudpickle==3.0.0 ; python_version >= "3.11" and python_version < "3.12"
colorama==0.4.6 ; python_version >= "3.11" and python_version < "3.12" and (sys_platform == "win32" or platform_system == "Windows")
coloredlogs==15.0.1 ; python_version >= "3.11" and python_version < "3.12"
cryptography==43.0.1 ; python_version >= "3.11" and python_version < "3.12"
ctranslate2==4.4.0 ; python_version >= "3.11" and python_version < "3.12"
datasets==2.14.4 ; python_version >= "3.11" and python_version < "3.12"
dill==0.3.7 ; python_version >= "3.11" and python_version < "3.12"
diskcache==5.6.3 ; python_version >= "3.11" and python_version < "3.12"
distro==1.9.0 ; python_version >= "3.11" and python_version < "3.12"
einops==0.8.0 ; python_version >= "3.11" and python_version < "3.12"
fastapi-versionizer==3.0.4 ; python_version >= "3.11" and python_version < "3.12"
fastapi==0.109.0 ; python_version >= "3.11" and python_version < "3.12"
fastapi==0.115.0 ; python_version >= "3.11" and python_version < "3.12"
faster-whisper==1.0.3 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.16.1 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==24.3.25 ; python_version >= "3.11" and python_version < "3.12"
frozenlist==1.4.1 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.9.0 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.1.0 ; python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version >= "3.11"
fsspec[http]==2024.9.0 ; python_version >= "3.11" and python_version < "3.12"
gguf==0.10.0 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.1.1 ; python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version >= "3.11"
h11==0.14.0 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.5 ; python_version >= "3.11" and python_version < "3.12"
httptools==0.6.1 ; python_version >= "3.11" and python_version < "3.12"
httpx==0.27.2 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.25.0 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.25.1 ; python_version >= "3.11" and python_version < "3.12"
humanfriendly==10.0 ; python_version >= "3.11" and python_version < "3.12"
idna==3.10 ; python_version >= "3.11" and python_version < "3.12"
importlib-metadata==8.5.0 ; python_version >= "3.11" and python_version < "3.12"
interegular==0.3.3 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.4 ; python_version >= "3.11" and python_version < "3.12"
jiter==0.5.0 ; python_version >= "3.11" and python_version < "3.12"
jmespath==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
jsonpatch==1.33 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==3.0.0 ; python_version >= "3.11" and python_version < "3.12"
langchain-core==0.3.2 ; python_version >= "3.11" and python_version < "3.12"
jsonschema-specifications==2023.12.1 ; python_version >= "3.11" and python_version < "3.12"
jsonschema==4.23.0 ; python_version >= "3.11" and python_version < "3.12"
langchain-core==0.3.6 ; python_version >= "3.11" and python_version < "3.12"
langchain-openai==0.2.0 ; python_version >= "3.11" and python_version < "3.12"
langchain-text-splitters==0.3.0 ; python_version >= "3.11" and python_version < "3.12"
langchain==0.3.0 ; python_version >= "3.11" and python_version < "3.12"
langsmith==0.1.125 ; python_version >= "3.11" and python_version < "3.12"
langchain==0.3.1 ; python_version >= "3.11" and python_version < "3.12"
langsmith==0.1.128 ; python_version >= "3.11" and python_version < "3.12"
lark==1.2.2 ; python_version >= "3.11" and python_version < "3.12"
llvmlite==0.43.0 ; python_version >= "3.11" and python_version < "3.12"
lm-format-enforcer==0.10.6 ; python_version >= "3.11" and python_version < "3.12"
markupsafe==2.1.5 ; python_version >= "3.11" and python_version < "3.12"
mistral-common==1.4.4 ; python_version >= "3.11" and python_version < "3.12"
mpmath==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
msgpack==1.1.0 ; python_version >= "3.11" and python_version < "3.12"
msgspec==0.18.6 ; python_version >= "3.11" and python_version < "3.12"
multidict==6.1.0 ; python_version >= "3.11" and python_version < "3.12"
multiprocess==0.70.15 ; python_version >= "3.11" and python_version < "3.12"
natsort==8.4.0 ; python_version >= "3.11" and python_version < "3.12"
nest-asyncio==1.6.0 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.3 ; python_version >= "3.11" and python_version < "3.12"
numba==0.60.0 ; python_version >= "3.11" and python_version < "3.12"
numpy==1.26.4 ; python_version >= "3.11" and python_version < "3.12"
nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cudnn-cu12==9.1.0.70 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-ml-py==12.560.30 ; python_version >= "3.11" and python_version < "3.12"
nvidia-nccl-cu12==2.20.5 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-nvjitlink-cu12==12.6.68 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.19.2 ; python_version >= "3.11" and python_version < "3.12"
openai==1.46.1 ; python_version >= "3.11" and python_version < "3.12"
openai==1.48.0 ; python_version >= "3.11" and python_version < "3.12"
orjson==3.10.7 ; python_version >= "3.11" and python_version < "3.12"
outlines==0.0.46 ; python_version >= "3.11" and python_version < "3.12"
packaging==24.1 ; python_version >= "3.11" and python_version < "3.12"
prometheus-client==0.19.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-fastapi-instrumentator==6.1.0 ; python_version >= "3.11" and python_version < "3.12"
pandas==2.2.3 ; python_version >= "3.11" and python_version < "3.12"
partial-json-parser==0.2.1.1.post4 ; python_version >= "3.11" and python_version < "3.12"
pillow==10.4.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-client==0.20.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-fastapi-instrumentator==7.0.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==5.28.2 ; python_version >= "3.11" and python_version < "3.12"
psutil==6.0.0 ; python_version >= "3.11" and python_version < "3.12"
py-cpuinfo==9.0.0 ; python_version >= "3.11" and python_version < "3.12"
pyairports==2.1.1 ; python_version >= "3.11" and python_version < "3.12"
pyarrow==17.0.0 ; python_version >= "3.11" and python_version < "3.12"
pybase64==1.4.0 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.22 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
pycountry==24.6.1 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.22 ; python_version >= "3.11" and python_version < "3.12" and (platform_python_implementation != "PyPy" or implementation_name == "pypy")
pydantic-core==2.23.4 ; python_version >= "3.11" and python_version < "3.12"
pydantic==2.9.2 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.9.0 ; python_version >= "3.11" and python_version < "3.12"
pyreadline3==3.5.4 ; sys_platform == "win32" and python_version >= "3.11" and python_version < "3.12"
python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "3.12"
python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
pytz==2024.2 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.2 ; python_version >= "3.11" and python_version < "3.12"
pyzmq==26.2.0 ; python_version >= "3.11" and python_version < "3.12"
ray==2.37.0 ; python_version >= "3.11" and python_version < "3.12"
redis==5.0.1 ; python_version >= "3.11" and python_version < "3.12"
referencing==0.35.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.9.11 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.3 ; python_version >= "3.11" and python_version < "3.12"
rpds-py==0.20.0 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.2 ; python_version >= "3.11" and python_version < "3.12"
safetensors==0.4.5 ; python_version >= "3.11" and python_version < "3.12"
sentencepiece==0.2.0 ; python_version >= "3.11" and python_version < "3.12"
setuptools==75.1.0 ; python_version >= "3.11" and python_version < "3.12"
six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
sniffio==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.35 ; python_version >= "3.11" and python_version < "3.12"
starlette==0.35.1 ; python_version >= "3.11" and python_version < "3.12"
starlette==0.38.6 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.13.3 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.5.0 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
tokenizers==0.20.0 ; python_version >= "3.11" and python_version < "3.12"
torch==2.0.1 ; python_version >= "3.11" and python_version < "3.12"
torchaudio==2.0.2 ; python_version >= "3.11" and python_version < "3.12"
torch==2.4.0 ; python_version >= "3.11" and python_version < "3.12"
torchaudio==2.4.0 ; python_version >= "3.11" and python_version < "3.12"
torchvision==0.19.0 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.5 ; python_version >= "3.11" and python_version < "3.12"
transformers==4.45.1 ; python_version >= "3.11" and python_version < "3.12"
triton==3.0.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" and python_version >= "3.11"
typing-extensions==4.12.2 ; python_version >= "3.11" and python_version < "3.12"
tzdata==2024.2 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.3 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.7.10 ; python_version >= "3.11" and python_version < "3.12"
uvicorn[standard]==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvloop==0.20.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_version >= "3.11" and python_version < "3.12"
vllm==0.6.2 ; python_version >= "3.11" and python_version < "3.12"
watchfiles==0.24.0 ; python_version >= "3.11" and python_version < "3.12"
websockets==13.0.1 ; python_version >= "3.11" and python_version < "3.12"
yarl==1.11.1 ; python_version >= "3.11" and python_version < "3.12"
websockets==13.1 ; python_version >= "3.11" and python_version < "3.12"
xformers==0.0.27.post2 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.11" and python_version < "3.12"
xxhash==3.5.0 ; python_version >= "3.11" and python_version < "3.12"
yarl==1.12.1 ; python_version >= "3.11" and python_version < "3.12"
zipp==3.20.2 ; python_version >= "3.11" and python_version < "3.12"
14 changes: 10 additions & 4 deletions run.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
#!/bin/sh
cd llama.cpp
make llama-server
cd ..

export LLAMA_N_CTX=32000
if nvcc --version
then
export CUDA_VISIBLE_DEVICES=0
else
cd llama.cpp
make llama-server
cd ..
fi

export LLAMA_N_CTX=8192
poetry run python -m uvicorn skynet.main:app --reload
4 changes: 2 additions & 2 deletions skynet/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def tobool(val: str | None):
azure_openai_api_version = os.environ.get('AZURE_OPENAI_API_VERSION', '2024-02-01')

# openai api
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/llama-server')
llama_cpp_server_path = os.environ.get('LLAMA_CPP_SERVER_PATH', './llama.cpp/llama-server')
vllm_server_path = os.environ.get('VLLM_SERVER_PATH', 'vllm.entrypoints.openai.api_server')
openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8003))
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}')

Expand Down Expand Up @@ -87,7 +88,6 @@ def tobool(val: str | None):
job_timeout = int(os.environ.get('JOB_TIMEOUT', 60 * 5)) # 5 minutes default

# summaries
summary_default_hint_type = os.environ.get('SUMMARY_DEFAULT_HINT_TYPE', 'text')
summary_minimum_payload_length = int(os.environ.get('SUMMARY_MINIMUM_PAYLOAD_LENGTH', 100))

# monitoring
Expand Down
9 changes: 1 addition & 8 deletions skynet/modules/stt/streaming_whisper/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,11 @@
from skynet.env import whisper_compute_type, whisper_device, whisper_gpu_indices, whisper_model_name, whisper_model_path
from skynet.logs import get_logger
from skynet.modules.stt.streaming_whisper.utils import vad_utils as vad
from skynet.utils import get_device

log = get_logger(__name__)


def get_device() -> str:
if torch.cuda.is_available():
log.debug('CUDA device found.')
return 'cuda'
log.warning('No CUDA device found, defaulting to CPU.')
return 'cpu'


vad_model = vad.init_jit_model(f'{os.getcwd()}/skynet/modules/stt/streaming_whisper/models/vad/silero_vad.jit')

device = whisper_device if whisper_device != 'auto' else get_device()
Expand Down
Loading

0 comments on commit 94397c2

Please sign in to comment.