Commit: Let llama.cpp download and cache the models and also push all layers to GPU
medihack committed Jun 16, 2024
1 parent 4b0b98c commit e17d90c
Showing 7 changed files with 30 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -11,7 +11,7 @@
},
// https://github.com/orgs/community/discussions/50403
// "initializeCommand": "docker system prune --all --force",
"postCreateCommand": "./.devcontainer/postCreateCommand.sh",
"postCreateCommand": "poetry install && poetry run invoke init-workspace",
"customizations": {
"vscode": {
"extensions": [
5 changes: 0 additions & 5 deletions .devcontainer/postCreateCommand.sh

This file was deleted.

1 change: 0 additions & 1 deletion .gitpod.yml
@@ -6,7 +6,6 @@ tasks:
init: |
poetry install
poetry run invoke init-workspace
- poetry run invoke download-llm -m tinyllama-1b-q2
ports:
- port: 8000
22 changes: 22 additions & 0 deletions KNOWLEDGE.md
@@ -0,0 +1,22 @@
# Knowledge

## LLM Models

### TinyLlama 1.1B

- Just for testing purposes (especially in the cloud IDE)
- <https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q2_K.gguf>

### Mistral 7B

- Low quality, good performance, low resources
- <https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.Q5_K_M.gguf>

### Mixtral 8x7B

- Medium quality, good performance, medium resources
- <https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf>

### Still to test

- <https://huggingface.co/lightblue/suzume-llama-3-8B-multilingual-gguf/resolve/main/ggml-model-Q8_0.gguf>
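
Note: the compose services below read the chosen model from `LLM_MODEL_URL`, so switching between the models listed above is just an env-file edit. A minimal sketch of the relevant entry (the variable name comes from this commit's entrypoints; the surrounding `.env.dev` contents are assumed):

```
# .env.dev (sketch, assumed layout) — any model URL from KNOWLEDGE.md works here
LLM_MODEL_URL=https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q2_K.gguf
```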
5 changes: 4 additions & 1 deletion compose/docker-compose.dev.yml
@@ -10,14 +10,15 @@ x-app: &default-app
pull_policy: never # only works with Docker Compose and not Docker Swarm

x-llamacpp: &llamacpp
+ environment:
+   LLAMA_CACHE: "/models"
env_file:
- ../.env.dev
hostname: llamacpp.local
ports:
- 9610:8080
volumes:
- models_data:/models
entrypoint: "/bin/bash -c '/llama-server -mu $${LLM_MODEL_URL} -c 512 --host 0.0.0.0 --port 8080'"

services:
init:
@@ -82,11 +83,13 @@ services:
llamacpp_cpu:
<<: *llamacpp
image: ghcr.io/ggerganov/llama.cpp:server
entrypoint: "/bin/bash -c '/llama-server -mu $${LLM_MODEL_URL} -c 512 --host 0.0.0.0 --port 8080'"
profiles: ["cpu"]

llamacpp_gpu:
<<: *llamacpp
image: ghcr.io/ggerganov/llama.cpp:server-cuda
entrypoint: "/bin/bash -c '/llama-server -mu $${LLM_MODEL_URL} -ngl 50 -c 4096 --host 0.0.0.0 --port 8080'"
deploy:
resources:
reservations:
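
With `LLAMA_CACHE` set to the mounted `models_data` volume, `llama-server` downloads the file given by `-mu` on first start and reuses the cached copy on later starts. A quick way to verify the dev service from the host (port 9610 per the mapping above; `/health` and `/completion` are llama.cpp server endpoints, the prompt is illustrative):

```bash
# Check that the dev llama.cpp server is up
curl -s http://localhost:9610/health

# Request a short completion via the native llama.cpp API
curl -s http://localhost:9610/completion \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Say hello in one word.", "n_predict": 8}'
```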
4 changes: 3 additions & 1 deletion compose/docker-compose.prod.yml
@@ -86,13 +86,15 @@ services:
llamacpp_gpu:
image: ghcr.io/ggerganov/llama.cpp:server-cuda
hostname: llamacpp.local
+ environment:
+   LLAMA_CACHE: "/models"
env_file:
- ../.env.prod
ports:
- 9610:8080
volumes:
- models_data:/models
entrypoint: "/bin/bash -c '/llama-server -mu $${LLM_MODEL_URL} -cb -c 2048 --host 0.0.0.0 --port 8080'"
entrypoint: "/bin/bash -c '/llama-server -mu $${LLM_MODEL_URL} -ngl 50 -cb -c 4096 --host 0.0.0.0 --port 8080'"
deploy:
# <<: *deploy
resources:
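
In the GPU entrypoints, `-ngl 50` offloads up to 50 layers to the GPU (high enough to cover all layers of the models listed in KNOWLEDGE.md), `-c 4096` raises the context size, and `-cb` (prod only) enables continuous batching. A rough manual equivalent of the prod service, assuming the NVIDIA Container Toolkit and the same image and cache volume (a sketch, not a supported invocation):

```bash
# Sketch: run the prod entrypoint by hand; LLM_MODEL_URL normally comes from ../.env.prod.
# "models_data" stands in for the compose-managed volume name.
docker run --rm --gpus all \
  -v models_data:/models \
  -e LLAMA_CACHE=/models \
  -p 9610:8080 \
  --entrypoint /llama-server \
  ghcr.io/ggerganov/llama.cpp:server-cuda \
  -mu "$LLM_MODEL_URL" -ngl 50 -cb -c 4096 --host 0.0.0.0 --port 8080
```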
55 changes: 0 additions & 55 deletions tasks.py
@@ -5,33 +5,11 @@
from pathlib import Path
from typing import Literal

- import requests
from dotenv import set_key
from invoke.context import Context
from invoke.tasks import task
- from tqdm import tqdm

Environments = Literal["dev", "prod"]
- AVAILABLE_MODELS = {
-     # TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF 0.48 GB
-     "tinyllama-1b-q2": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q2_K.gguf",
-     # MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF 4.37 GB
-     "mistral-7b-q4": "https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf",
-     # MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF 5.14 GB
-     "mistral-7b-q5": "https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.Q5_K_M.gguf",
-     # TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF 32.2 GB
-     "mixtral-8x7b-q5": "https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf",
-     # QuantFactory/Meta-Llama-3-8B-Instruct-GGUF 5.73 GB
-     "llama3-q5": "https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/blob/main/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf",
-     # QuantFactory/Meta-Llama-3-8B-Instruct-GGUF 8.54 GB
-     "llama3-q8": "https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q8_0.gguf",
-     # lightblue/suzume-llama-3-8B-multilingual-gguf 4.92 GB
-     "llama3-ml-q4": "https://huggingface.co/lightblue/suzume-llama-3-8B-multilingual-gguf/resolve/main/ggml-model-Q4_K_M.gguf",
-     # lightblue/suzume-llama-3-8B-multilingual-gguf 5.14 GB
-     "llama3-ml-q5": "https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.Q5_K_M.gguf",
-     # lightblue/suzume-llama-3-8B-multilingual-gguf 8.54 GB
-     "llama3-ml-q8": "https://huggingface.co/lightblue/suzume-llama-3-8B-multilingual-gguf/resolve/main/ggml-model-Q8_0.gguf",
- }

stack_name_dev = "radis_dev"
stack_name_prod = "radis_prod"
@@ -117,22 +95,6 @@ def confirm(question: str) -> bool:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")


- def download_with_progress_bar(url: str, filepath: Path):
-     response = requests.get(url, stream=True)
-
-     total_size = int(response.headers.get("content-length", 0))
-     block_size = 1024
-
-     with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
-         with open(filepath, "wb") as file:
-             for data in response.iter_content(block_size):
-                 progress_bar.update(len(data))
-                 file.write(data)
-
-     if total_size != 0 and progress_bar.n != total_size:
-         raise RuntimeError("Could not download file")


###
# Tasks
###
@@ -499,20 +461,3 @@ def upgrade_postgresql(ctx: Context, env: Environments = "dev", version: str = "
)
else:
print("Cancelled")


- @task
- def download_llm(ctx: Context, model: str):
-     url = AVAILABLE_MODELS.get(model)
-     if not url:
-         print(f"Unknown model: {model}")
-         print(f"Available models: {', '.join(AVAILABLE_MODELS.keys())}")
-         return
-
-     models_dir.mkdir(parents=True, exist_ok=True)
-     model_path = models_dir / "model.gguf"
-     if model_path.exists():
-         print(f"Model {model} already exists. Skipping download.")
-         return
-
-     download_with_progress_bar(url, model_path)
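
With the model registry and the `download-llm` task removed, `tasks.py` no longer touches model files at all; the server fetches and caches its own model. A hedged sketch of how client code might talk to it through the OpenAI-compatible endpoint that the llama.cpp server also exposes (host, port, and payload are illustrative, not taken from this repository):

```python
import requests

# Sketch: minimal chat request against the dev llama.cpp server (host port 9610)
response = requests.post(
    "http://localhost:9610/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Summarize this report in one sentence."}],
        "max_tokens": 64,
    },
    timeout=120,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```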
