Add llama.cpp backend #2723

Status: Open. Wants to merge 91 commits into base: main.
Changes shown from 74 of 91 commits.

Commits:
aa1fcba  feat(llamacpp): initial commit (mfuntowicz, Oct 3, 2024)
7d1f8a2  feat(llamacpp): correctly handle CMAKE_BUILD_TYPE for spdlog macros (mfuntowicz, Oct 3, 2024)
52d57dc  feat(llamacpp): initial end2end build (mfuntowicz, Oct 4, 2024)
e4432d3  misc(cmake): add parameter to build specific cuda arch (mfuntowicz, Oct 18, 2024)
fa89d1e  misc(cmake): wut (mfuntowicz, Oct 21, 2024)
05ad684  feat(llamacpp): enable cuda (mfuntowicz, Oct 21, 2024)
0911076  feat(backend): correctly load llama.cpp model from llama api and not … (mfuntowicz, Oct 22, 2024)
098c669  feat(backend): tell cmake to build llama-common and link to it (mfuntowicz, Oct 22, 2024)
45d5a6a  feat(backend): add some initial decoding steps (mfuntowicz, Oct 22, 2024)
92bb113  feat(backend): use llama_token as TokenId type (mfuntowicz, Oct 22, 2024)
d4b5be1  feat(backend): minor refactor (mfuntowicz, Oct 23, 2024)
37faeb3  feat(backend): expose frequency and repetition penalties (mfuntowicz, Oct 23, 2024)
f9c2486  chore(backend): minor formatting (mfuntowicz, Oct 23, 2024)
355d8a5  feat(backend): wip Rust binding (mfuntowicz, Oct 24, 2024)
e4d803c  feat(backend): build and link through build.rs (mfuntowicz, Oct 24, 2024)
f0859c2  misc(build): handle different lib destination folder lib/lib64 (mfuntowicz, Oct 25, 2024)
179309b  misc(build): refactor build type detection in cmake (mfuntowicz, Oct 25, 2024)
a316c53  feat(llamacpp): expose number of threads for the backend when constru… (mfuntowicz, Oct 25, 2024)
0c1dd0e  feat(llamacpp): wip explosion (mfuntowicz, Oct 29, 2024)
dbc5b7a  misc(offline): link correctly (mfuntowicz, Oct 26, 2024)
6115904  misc(offline): expose more parameters for generate (mfuntowicz, Oct 28, 2024)
b98c635  feat(backend): entirely rewrite backend (mfuntowicz, Oct 30, 2024)
6a5f6b0  misc(offline): update offline tester (mfuntowicz, Oct 30, 2024)
d52b4c4  feat(backend): full rework of the backend internal to safer c++ (mfuntowicz, Oct 31, 2024)
3af2c68  misc(offline): match rework (mfuntowicz, Oct 31, 2024)
f39edc7  feat(backend): add mapping for ignore_eos_token stopping criteria (mfuntowicz, Oct 31, 2024)
d4aee42  feat(backend): add logit parameter in the callback fn (mfuntowicz, Oct 31, 2024)
612f2f9  feat(backend): bind incoming request to the server (mfuntowicz, Oct 31, 2024)
b50dcdd  feat(backend): avoid dropping the boxed stream at the end of the call… (mfuntowicz, Nov 2, 2024)
3e82f14  feat(backend): somewhat generates the final infer response (mfuntowicz, Nov 2, 2024)
bd8f0f1  feat(backend): fix invalid reference to ctx instead of context in rel… (mfuntowicz, Nov 2, 2024)
2cdfed9  feat(backend): correctly link to shared fmt and spdlog instead of static (mfuntowicz, Nov 2, 2024)
86a2ae6  chore: unsued variables (mfuntowicz, Nov 2, 2024)
7b0a56f  feat(backend): fix memory leaking on llama_sampler when the decode ends (mfuntowicz, Nov 3, 2024)
31d9254  feat(backend): remove static from inner_fw visitor as it leads to inv… (mfuntowicz, Nov 3, 2024)
188442f  misc(lint): make clippy happier (mfuntowicz, Nov 3, 2024)
05ff551  feat(backend): add number of generated tokens in the callback (mfuntowicz, Nov 3, 2024)
06424aa  feat(backend): correctly handle the max_new_tokens case for is_eos (mfuntowicz, Nov 3, 2024)
11c593d  feat(backend): make eog clearer on c++ side (mfuntowicz, Nov 3, 2024)
5b7a951  feat(backend): refactor the callback to handle intermediate and end i… (mfuntowicz, Nov 4, 2024)
958c72a  misc(ffi): remove unused ffi mapping (mfuntowicz, Nov 4, 2024)
1473259  feat(backend): add early stopping criteria from TGI stream callback (mfuntowicz, Nov 4, 2024)
1149186  feat(backend): expose tokenizer to the GenerationContext to decode token (mfuntowicz, Nov 4, 2024)
52208f5  misc(backend): decrease log verbosity in callback (mfuntowicz, Nov 4, 2024)
62dba1a  misc(cmake): use url deps and not git repo (mfuntowicz, Nov 5, 2024)
5884218  misc(backend): missing header <variant> (mfuntowicz, Nov 5, 2024)
a1154b1  feat(backend): avoid copy constructor (mfuntowicz, Nov 5, 2024)
7eec0f7  chore(backend): minor fixes mostly format (mfuntowicz, Nov 5, 2024)
a7afde4  feat(backend): dockerfile (mfuntowicz, Nov 5, 2024)
2065282  feat(dockerfile): build process (mfuntowicz, Nov 6, 2024)
26d0266  feat(backend): handle all the tokenization failure and send back to t… (mfuntowicz, Nov 6, 2024)
cf17928  misc(cmake): remove dependency on fmt (mfuntowicz, Nov 7, 2024)
4f5397c  misc(cmake): use URL base llama.cpp repo (mfuntowicz, Nov 7, 2024)
86d30ae  feat(backend): simplify overall cpp structure (mfuntowicz, Nov 9, 2024)
6915fa3  feat(backend): remove reinterpret_cast converting from uint32_t to ll… (mfuntowicz, Nov 9, 2024)
7e2890f  feat(backend): remove unused function (mfuntowicz, Nov 11, 2024)
488ba93  feat(backend): fix invalid reference to context in release mode (mfuntowicz, Nov 11, 2024)
363d5e4  feat(backend): use std::ranges to map uint32_t to llama_token (mfuntowicz, Nov 12, 2024)
02cd6fe  chore(backend): minor improvements (mfuntowicz, Nov 12, 2024)
daf1631  dockerfile(backend): initial working version of llama.cpp container (mfuntowicz, Nov 12, 2024)
57b2154  feat(backend): simplify Rust callback (mfuntowicz, Nov 12, 2024)
6f059c4  feat(backend): wrap Arc tokenizer to avoid duplicating (mfuntowicz, Nov 14, 2024)
70c90ad  feat(backend): update llamacpp to 4077 (mfuntowicz, Nov 14, 2024)
23d2bcf  misc(build): improve build process (mfuntowicz, Nov 14, 2024)
5335bf9  feat(backend): multistream inference on CPU (mfuntowicz, Nov 20, 2024)
50c3766  feat(backend): bind thread and memory affinity for thread (mfuntowicz, Nov 21, 2024)
84eead2  feat(backend): correctly setup llama_context providing n_threads and … (mfuntowicz, Nov 21, 2024)
5a85661  feat(backend): rely on multi consumer queue to scheduler workers (mfuntowicz, Nov 22, 2024)
30ae996  misc(docker): add numa lib as dependency (mfuntowicz, Nov 22, 2024)
2d9465d  misc(backend): allow rebinding numa core affinity (mfuntowicz, Nov 22, 2024)
4ee2ee5  misc(license): update LICENSE (mfuntowicz, Nov 22, 2024)
b9c04b9  misc(doc): c++ documentation (mfuntowicz, Nov 22, 2024)
862a519  misc(doc): rust documentation (mfuntowicz, Nov 22, 2024)
9025a26  chore: remove unrelated change to trtllm (mfuntowicz, Nov 22, 2024)
bbe95ca  Update Dockerfile.llamacpp as per review (mfuntowicz, Nov 28, 2024)
d918e6a  Update Dockerfile.llamacpp as per review (mfuntowicz, Nov 28, 2024)
274cfce  feat(backend): remove core overriding in the Rust backend (mfuntowicz, Nov 28, 2024)
8e89793  feat(backend): use the new batch api from llama (mfuntowicz, Nov 28, 2024)
298367c  feat(backend): fix when num_cores_per_instance is equals to zero with… (mfuntowicz, Nov 28, 2024)
929a2fc  feat(backend): add some test to the backend for core allocation (mfuntowicz, Nov 28, 2024)
df72c56  feat(backend): add guard in case top_k = 0 (mfuntowicz, Nov 28, 2024)
9d659f1  feat(backend): add missing temperature parameter (mfuntowicz, Nov 28, 2024)
6c5a75b  misc(offline): update model creation as std::shared_ptr (mfuntowicz, Nov 28, 2024)
b1ebc8f  feat(backend): update llama.cpp to 4215 (mfuntowicz, Nov 28, 2024)
dc6435e  feat(backend): create llama_context_params with default factory (mfuntowicz, Nov 28, 2024)
b10eaab  feat(backend): use new batch API to generate tokens (mfuntowicz, Nov 28, 2024)
59b0ef3  feat: Fix Cmakelist to allow building on Darwin platform (#2785) (Hugoch, Nov 28, 2024)
f5c4cee  feat(backend): correctly link to all libraries (mfuntowicz, Nov 29, 2024)
db41776  feat(backend): add mimalloc memory allocator to the container (mfuntowicz, Nov 29, 2024)
c9f6c3a  feat(backend): better map exception throw on C++ side (mfuntowicz, Nov 29, 2024)
e0dda9b  feat(backend): use c++ defined types for llama.cpp (mfuntowicz, Nov 29, 2024)
127 changes: 127 additions & 0 deletions Cargo.lock
(Generated file; diff not rendered by default.)
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -7,7 +7,7 @@ members = [
    "backends/trtllm",
    "launcher",
    "router"
-]
+, "backends/llamacpp"]
default-members = [
    "benchmark",
    "backends/v2",
74 changes: 74 additions & 0 deletions Dockerfile.llamacpp
@@ -0,0 +1,74 @@
# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
WORKDIR /usr/src/text-generation-inference/

FROM chef AS planner
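# Planner stage: copy the workspace manifests and record a cargo-chef recipe.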
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY backends backends
COPY benchmark benchmark
COPY clients clients
COPY launcher launcher
COPY router router

RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
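# Builder stage: install the C/C++ toolchain, cook cached dependencies, then build the backend.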
ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt update && DEBIAN_FRONTEND=noninteractive apt install -y \
clang \
cmake \
gcc g++ \
libc++-dev \
libnuma-dev \
libopenmpi-dev \
libssl-dev \
ninja-build \
openssl \
python3-dev

RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \
&& update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 10 \
&& update-alternatives --auto cc \
&& update-alternatives --auto c++ \
&& update-alternatives --display cc \
&& update-alternatives --display c++ \
&& cc --version \
&& c++ --version

COPY --from=planner /usr/src/text-generation-inference/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --recipe-path recipe.json

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY backends backends
COPY benchmark benchmark
COPY launcher launcher
COPY router router

ENV RUSTFLAGS="-L/usr/lib"
ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist
RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen

FROM ubuntu:22.04
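# Runtime stage: minimal Ubuntu image carrying only the runtime dependencies.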
ENV DEBIAN_FRONTEND=noninteractive

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt update && \
apt upgrade -y && \
apt install -y \
numactl \
openssl \
python3.11-dev

COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher
COPY --from=builder /usr/src/text-generation-inference/dist /usr/

ENV PORT=8080
WORKDIR /usr/src/text-generation-inference
ENTRYPOINT ["text-generation-launcher"]
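
A usage sketch, not part of the diff: building and running this image could look like the following, where the image tag and model id are illustrative placeholders and --model-id is the usual text-generation-launcher flag.

# Build the image from the repository root (tag is illustrative).
docker build -f Dockerfile.llamacpp -t tgi-llamacpp .
# Run the launcher; 8080 matches the PORT set above, <model-id> is a placeholder.
docker run --rm -p 8080:8080 tgi-llamacpp --model-id <model-id>
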
3 changes: 2 additions & 1 deletion LICENSE
@@ -1,3 +1,4 @@
+
 Apache License
 Version 2.0, January 2004
 http://www.apache.org/licenses/
@@ -186,7 +187,7 @@
 same "printed page" as the copyright notice for easier
 identification within third-party archives.

-Copyright 2022 Hugging Face
+Copyright 2024 Hugging Face Inc.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
62 changes: 62 additions & 0 deletions backends/llamacpp/CMakeLists.txt
@@ -0,0 +1,62 @@
cmake_minimum_required(VERSION 3.24)

project(tgi-llama-cpp-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 23)

include(FetchContent)

set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against")
set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build")
option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner")
option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp")

if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
message(STATUS "Targeting libc++")
set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS})
else ()
message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}")
endif ()

# Add dependencies
include(cmake/numa.cmake)
include(cmake/spdlog.cmake)

if (${LLAMA_CPP_BUILD_CUDA})
message(STATUS "Enabling llama.cpp CUDA support")

if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES ${LLAMA_CPP_TARGET_CUDA_ARCHS})
endif ()
set(GGML_CUDA ON)
endif ()

# Download llama.cpp repo at the specific version
fetchcontent_declare(
llama
URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4077.tar.gz
)

fetchcontent_makeavailable(llama)

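# Static library holding the C++ backend implementation, built and linked from the Rust crate via build.rs.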
add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11)
target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama)

if (NUMA_FOUND)
target_link_libraries(tgi_llamacpp_backend_impl PUBLIC numa)
endif ()

install(TARGETS tgi_llamacpp_backend_impl spdlog llama)

if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_compile_definitions(tgi_llamacpp_backend_impl PRIVATE TGI_LLAMACPP_BACKEND_DEBUG=1)
endif ()

if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER})
message(STATUS "Building llama.cpp offline runner")
add_executable(tgi_llamacpp_offline_runner offline/main.cpp)

target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama spdlog::spdlog)
endif ()


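A configuration sketch against the cache variables defined above, assuming Ninja is available and the commands run from backends/llamacpp; all values are illustrative.

# Configure with CUDA and the offline runner enabled (arch list is an example).
cmake -B build -G Ninja \
    -DCMAKE_BUILD_TYPE=Release \
    -DLLAMA_CPP_BUILD_CUDA=ON \
    -DLLAMA_CPP_TARGET_CUDA_ARCHS="86-real" \
    -DLLAMA_CPP_BUILD_OFFLINE_RUNNER=ON
# Build the backend library and, if enabled, the offline runner.
cmake --build build --parallel
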
33 changes: 33 additions & 0 deletions backends/llamacpp/Cargo.toml
@@ -0,0 +1,33 @@
[package]
name = "text-generation-backend-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]
async-trait = "0.1"
async-channel = "2.3"
clap = { version = "4.5.19", features = ["derive"] }
cxx = "1.0"
num_cpus = "1"
hf-hub = { workspace = true }
image = { version = "0.25.1", features = ["default-formats"] }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
serde_json = "1.0.128"
text-generation-router = { path = "../../router" }
thiserror = "1.0.64"
tokio = "1.40.0"
tokio-stream = "0.1.16"
tokenizers = { workspace = true }
tracing = "0.1"
tracing-opentelemetry = "0.27.0"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
utoipa = { version = "4.2.3", features = ["axum_extras"] }
log = "0.4.22"

[build-dependencies]
cmake = "0.1"
cxx-build = { version = "1.0", features = ["parallel"] }
pkg-config = "0.3"
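
Outside Docker, a local build sketch mirroring the cargo invocation in Dockerfile.llamacpp; it assumes CMake and a C++ toolchain on the host, since build.rs drives the CMake build of the C++ backend.

# From the repository root, using the workspace's release-opt profile.
cargo build --profile release-opt \
    --package text-generation-backend-llamacpp \
    --bin text-generation-backend-llamacpp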