From aa1fcba59fef8f3685f2851ac1de4b4420c69cd1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 3 Oct 2024 14:00:17 +0200 Subject: [PATCH 01/91] feat(llamacpp): initial commit # Conflicts: # Cargo.lock --- Cargo.toml | 2 +- backends/llamacpp/CMakeLists.txt | 28 ++++++++++++ backends/llamacpp/Cargo.toml | 8 ++++ backends/llamacpp/cmake/fmt.cmake | 6 +++ backends/llamacpp/cmake/spdlog.cmake | 17 +++++++ backends/llamacpp/csrc/backend.cpp | 66 ++++++++++++++++++++++++++++ backends/llamacpp/csrc/backend.hpp | 28 ++++++++++++ backends/llamacpp/src/main.rs | 3 ++ 8 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 backends/llamacpp/CMakeLists.txt create mode 100644 backends/llamacpp/Cargo.toml create mode 100644 backends/llamacpp/cmake/fmt.cmake create mode 100644 backends/llamacpp/cmake/spdlog.cmake create mode 100644 backends/llamacpp/csrc/backend.cpp create mode 100644 backends/llamacpp/csrc/backend.hpp create mode 100644 backends/llamacpp/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 9a7e76c412b..f3ab5ee546f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ "backends/trtllm", "launcher", "router" -] +, "backends/llamacpp"] default-members = [ "benchmark", "backends/v2", diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt new file mode 100644 index 00000000000..2f9026f1656 --- /dev/null +++ b/backends/llamacpp/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.20) + +project(tgi-llama-cpp-backend VERSION 1.0.0) +set(CMAKE_CXX_STANDARD 20) + +include(FetchContent) + +set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") + + +# Add dependencies +include(cmake/fmt.cmake) +include(cmake/spdlog.cmake) + +# Download llama.cpp repo at the specific version +fetchcontent_declare( + llama +# DOWNLOAD_EXTRACT_TIMESTAMP TRUE + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git + GIT_TAG b3837 + GIT_SHALLOW FALSE +) + +fetchcontent_makeavailable(llama) + +add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) +target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) +target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common) diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml new file mode 100644 index 00000000000..2e8ed7ddca9 --- /dev/null +++ b/backends/llamacpp/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "text-generation-backend-llamacpp" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +[dependencies] diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake new file mode 100644 index 00000000000..f94a9c5668f --- /dev/null +++ b/backends/llamacpp/cmake/fmt.cmake @@ -0,0 +1,6 @@ +FetchContent_Declare( + fmt + GIT_REPOSITORY https://github.com/fmtlib/fmt + GIT_TAG 11.0.1 +) +FetchContent_MakeAvailable(fmt) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake new file mode 100644 index 00000000000..c4ee5c97a58 --- /dev/null +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -0,0 +1,17 @@ +set(SPDLOG_USE_FMT ON) +set(SPDLOG_BUILD_SHARED OFF) +set(SPDLOG_FMT_EXTERNAL ON) + +# Define the level at which SPDLOG_ compilation level is defined +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) +else () + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) +endif () + +fetchcontent_declare( + spdlog + 
GIT_REPOSITORY https://github.com/gabime/spdlog.git + GIT_TAG v1.14.1 +) +fetchcontent_makeavailable(spdlog) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp new file mode 100644 index 00000000000..9ce1dbc92ff --- /dev/null +++ b/backends/llamacpp/csrc/backend.cpp @@ -0,0 +1,66 @@ +// +// Created by Morgan Funtowicz on 9/28/2024. +// + +#include +#include +#include +#include +#include "backend.hpp" + +namespace huggingface::tgi::backends::llama { + + std::unique_ptr CreateLlamaCppBackend(std::string_view root) { + SPDLOG_INFO(FMT_STRING("Loading model from {}"), root); + gpt_init(); + + // Fake argv + std::vector args = {"tgi_llama_cpp_backend", "--model", root}; + std::vector argv; + for(const auto& arg : args) { + argv.push_back(const_cast(arg.data())); + } + argv.push_back(nullptr); + + // Create the GPT parameters + gpt_params params; + if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) { + throw std::runtime_error("Failed to create GPT Params from model"); + } + + + // Create the inference engine + SPDLOG_INFO("Allocating llama.cpp model from gpt_params"); + auto result = llama_init_from_gpt_params(params); + + // Unpack all the inference engine components + auto model = result.model; + auto context = result.context; + auto loras = result.lora_adapters; + + // Make sure everything is correctly initialized + if(model == nullptr) + throw std::runtime_error(fmt::format("Failed to load model from {}", root)); + + return std::make_unique(model, context); + } + + TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) + : model(model), ctx(ctx), batch() { + + } + + TgiLlamaCppBackend::~TgiLlamaCppBackend() { + if(model) + { + SPDLOG_DEBUG("Freeing llama.cpp model"); + llama_free_model(model); + } + + if(ctx) + { + SPDLOG_DEBUG("Freeing llama.cpp context"); + llama_free(ctx); + } + } +} \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp new file mode 100644 index 00000000000..a643454e756 --- /dev/null +++ b/backends/llamacpp/csrc/backend.hpp @@ -0,0 +1,28 @@ +// +// Created by Morgan Funtowicz on 9/28/2024. 
+// + +#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP + +#include +#include + +namespace huggingface::tgi::backends::llama { + const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; + + + class TgiLlamaCppBackend { + private: + llama_model* model; + llama_context* ctx; + llama_batch batch; + public: + TgiLlamaCppBackend(llama_model* const model, llama_context* const); + ~TgiLlamaCppBackend(); + }; + + std::unique_ptr CreateLlamaCppBackend(std::string_view root); +} + +#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs new file mode 100644 index 00000000000..e7a11a969c0 --- /dev/null +++ b/backends/llamacpp/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} From 7d1f8a2bd6695be7a3efd6512c70093c6ae22d6d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 3 Oct 2024 15:25:15 +0200 Subject: [PATCH 02/91] feat(llamacpp): correctly handle CMAKE_BUILD_TYPE for spdlog macros --- backends/llamacpp/cmake/spdlog.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index c4ee5c97a58..9cd210dd1d1 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -3,7 +3,7 @@ set(SPDLOG_BUILD_SHARED OFF) set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") +if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) else () add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) From 52d57dca798f7eb0ba92b91733e33579921fa03a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 4 Oct 2024 10:42:31 +0200 Subject: [PATCH 03/91] feat(llamacpp): initial end2end build --- backends/llamacpp/CMakeLists.txt | 18 ++- backends/llamacpp/Cargo.toml | 17 +++ backends/llamacpp/build.rs | 94 +++++++++++++ backends/llamacpp/cmake/spdlog.cmake | 7 +- backends/llamacpp/csrc/backend.cpp | 11 +- backends/llamacpp/csrc/backend.hpp | 7 +- backends/llamacpp/offline/main.cpp | 22 +++ backends/llamacpp/src/backend.rs | 18 +++ backends/llamacpp/src/lib.rs | 11 ++ backends/llamacpp/src/main.rs | 203 ++++++++++++++++++++++++++- backends/trtllm/CMakeLists.txt | 2 + 11 files changed, 398 insertions(+), 12 deletions(-) create mode 100644 backends/llamacpp/build.rs create mode 100644 backends/llamacpp/offline/main.cpp create mode 100644 backends/llamacpp/src/backend.rs create mode 100644 backends/llamacpp/src/lib.rs diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 2f9026f1656..4671314f3dc 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,12 +6,18 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") - +option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") +option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") # Add dependencies include(cmake/fmt.cmake) include(cmake/spdlog.cmake) +if(${LLAMA_CPP_BUILD_CUDA}) + message(STATUS "Enabling llama.cpp CUDA support") + set(GGML_CUDA ON) +endif() + # Download llama.cpp repo at the specific version fetchcontent_declare( llama @@ -25,4 +31,12 @@ fetchcontent_makeavailable(llama) add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) 
target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common) +target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) + +if(${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) + message(STATUS "Building llama.cpp offline runner") + add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) + target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl) +endif() + + diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 2e8ed7ddca9..fdd980c308f 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -6,3 +6,20 @@ authors.workspace = true homepage.workspace = true [dependencies] +clap = { version = "4.5.19", features = ["derive"] } +cxx = "1.0" +hf-hub = { workspace = true } +image = { version = "0.25.1", features = ["default-formats"] } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } +serde_json = "1.0.128" +text-generation-router = { path = "../../router" } +thiserror = "1.0.64" +tokio = "1.40.0" +tokio-stream = "0.1.16" +tokenizers = { workspace = true } + +[build-dependencies] +cmake = "0.1" +cxx-build = { version = "1.0", features = ["parallel"] } +pkg-config = "0.3" \ No newline at end of file diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs new file mode 100644 index 00000000000..4e8859aba7e --- /dev/null +++ b/backends/llamacpp/build.rs @@ -0,0 +1,94 @@ +use cxx_build::CFG; +use std::env; +use std::path::PathBuf; + +const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; +const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; +const MPI_REQUIRED_VERSION: &str = "4.1"; + +macro_rules! 
probe { + ($name: expr, $version: expr) => { + if let Err(_) = pkg_config::probe_library($name) { + pkg_config::probe_library(&format!("{}-{}", $name, $version)) + .expect(&format!("Failed to locate {}", $name)); + } + }; +} + +fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf { + let install_path = env::var("CMAKE_INSTALL_PREFIX") + .map(|val| PathBuf::from(val)) + .unwrap_or(out_dir.join("dist")); + + let _ = cmake::Config::new(".") + .uses_cxx11() + .generator("Ninja") + .profile(match is_debug { + true => "Debug", + false => "Release", + }) + .env("OPT_LEVEL", opt_level) + .define("CMAKE_INSTALL_PREFIX", &install_path) + // .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") + // .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) + // .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path) + .build(); + + // Additional transitive CMake dependencies + let deps_folder = out_dir.join("build").join("_deps"); + for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { + let dep_name = match is_debug { + true => format!("{}d", dependency), + false => String::from(dependency), + }; + let dep_path = deps_folder.join(format!("{}-build", dependency)); + println!("cargo:rustc-link-search={}", dep_path.display()); + println!("cargo:rustc-link-lib=static={}", dep_name); + } + + let deps_folder = out_dir.join("build").join("_deps"); + deps_folder +} + +fn build_ffi_layer(deps_folder: &PathBuf) { + println!("cargo:warning={}", &deps_folder.display()); + CFG.include_prefix = "backends/llamacpp"; + cxx_build::bridge("src/lib.rs") + .static_flag(true) + .include(deps_folder.join("fmt-src").join("include")) + .include(deps_folder.join("spdlog-src").join("include")) + .include(deps_folder.join("llama-src").join("common")) + .include(deps_folder.join("llama-src").join("ggml").join("include")) + .include(deps_folder.join("llama-src").join("include")) + .file("csrc/backend.cpp") + .std("c++20") + .compile(CMAKE_LLAMA_CPP_TARGET); + + println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=csrc/backend.hpp"); + println!("cargo:rerun-if-changed=csrc/backend.cpp"); +} + +fn main() { + // Misc variables + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let build_profile = env::var("PROFILE").unwrap(); + let (is_debug, opt_level) = match build_profile.as_ref() { + "debug" => (true, "0"), + _ => (false, "3"), + }; + + // Build the backend + let deps_folder = build_backend(is_debug, opt_level, &out_dir); + + // Build the FFI layer calling the backend above + build_ffi_layer(&deps_folder); + + // Emit linkage search path + probe!("ompi", MPI_REQUIRED_VERSION); + + // Backend + // BACKEND_DEPS.iter().for_each(|name| { + // println!("cargo:rustc-link-lib=static={}", name); + // }); +} diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 9cd210dd1d1..68658ba5019 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -4,9 +4,10 @@ set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) -else () - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) + message(STATUS "Verbose logging is enabled in debug build") + add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG) +else() + add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO) endif () fetchcontent_declare( diff --git 
a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 9ce1dbc92ff..875fdb684bf 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -46,8 +46,11 @@ namespace huggingface::tgi::backends::llama { } TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx), batch() { - + : model(model), ctx(ctx), batch() + { + char modelName[128]; + llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); + SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); } TgiLlamaCppBackend::~TgiLlamaCppBackend() { @@ -63,4 +66,8 @@ namespace huggingface::tgi::backends::llama { llama_free(ctx); } } + + void TgiLlamaCppBackend::schedule() { + std::vector tokens; + } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index a643454e756..7e3c9020c93 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -1,7 +1,6 @@ // // Created by Morgan Funtowicz on 9/28/2024. // - #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP @@ -9,7 +8,7 @@ #include namespace huggingface::tgi::backends::llama { - const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; +// const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; class TgiLlamaCppBackend { @@ -18,8 +17,10 @@ namespace huggingface::tgi::backends::llama { llama_context* ctx; llama_batch batch; public: - TgiLlamaCppBackend(llama_model* const model, llama_context* const); + TgiLlamaCppBackend(llama_model *model, llama_context *ctx); ~TgiLlamaCppBackend(); + + void schedule(); }; std::unique_ptr CreateLlamaCppBackend(std::string_view root); diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp new file mode 100644 index 00000000000..4009588d4d1 --- /dev/null +++ b/backends/llamacpp/offline/main.cpp @@ -0,0 +1,22 @@ +// +// Created by mfuntowicz on 10/3/24. 
+// + +#include +#include +#include +#include +#include "../csrc/backend.hpp" + +int main(int argc, char** argv) { + if(argc < 2) { + fmt::print("No model folder provider"); + return 1; + } + + spdlog::set_level(spdlog::level::debug); + + const std::string_view model_root = argv[1]; + auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root); + fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root); +} \ No newline at end of file diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs new file mode 100644 index 00000000000..8af1067b9d4 --- /dev/null +++ b/backends/llamacpp/src/backend.rs @@ -0,0 +1,18 @@ +use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::validation::ValidGenerateRequest; +use tokio_stream::wrappers::UnboundedReceiverStream; + +pub struct TgiLlamaCppBakend {} + +impl Backend for TgiLlamaCppBakend { + fn schedule( + &self, + request: ValidGenerateRequest, + ) -> Result>, InferError> { + Err(InferError::GenerationError("Not implemented yet".into())) + } + + async fn health(&self, current_health: bool) -> bool { + todo!() + } +} diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs new file mode 100644 index 00000000000..d4c3caf9a0c --- /dev/null +++ b/backends/llamacpp/src/lib.rs @@ -0,0 +1,11 @@ +pub mod backend; + +#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] +mod ffi { + unsafe extern "C++" { + include!("backends/llamacpp/csrc/backend.cpp"); + + /// Represent an instance of the llama.cpp backend instance on C++ side + type LlamaCppBackendImpl; + } +} diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index e7a11a969c0..7226473c70f 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,3 +1,202 @@ -fn main() { - println!("Hello, world!"); +use clap::{Parser, Subcommand}; +use text_generation_router::{server, usage_stats}; +use thiserror::Error; +use text_generation_router::server::ApiDoc; + +/// App Configuration +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + #[command(subcommand)] + command: Option, + + #[clap(default_value = "128", long, env)] + max_concurrent_requests: usize, + #[clap(default_value = "2", long, env)] + max_best_of: usize, + #[clap(default_value = "4", long, env)] + max_stop_sequences: usize, + #[clap(default_value = "5", long, env)] + max_top_n_tokens: u32, + #[clap(default_value = "1024", long, env)] + max_input_tokens: usize, + #[clap(default_value = "2048", long, env)] + max_total_tokens: usize, + #[clap(default_value = "1.2", long, env)] + waiting_served_ratio: f32, + #[clap(default_value = "4096", long, env)] + max_batch_prefill_tokens: u32, + #[clap(long, env)] + max_batch_total_tokens: Option, + #[clap(default_value = "20", long, env)] + max_waiting_tokens: usize, + #[clap(long, env)] + max_batch_size: Option, + #[clap(default_value = "0.0.0.0", long, env)] + hostname: String, + #[clap(default_value = "3000", long, short, env)] + port: u16, + #[clap(default_value = "/tmp/text-generation-server-0", long, env)] + master_shard_uds_path: String, + #[clap(default_value = "bigscience/bloom", long, env)] + tokenizer_name: String, + #[clap(long, env)] + tokenizer_config_path: Option, + #[clap(long, env)] + revision: Option, + #[clap(default_value = "2", long, env)] + validation_workers: usize, + #[clap(long, env)] + api_key: Option, + 
#[clap(long, env)] + json_output: bool, + #[clap(long, env)] + otlp_endpoint: Option, + #[clap(default_value = "text-generation-inference.router", long, env)] + otlp_service_name: String, + #[clap(long, env)] + cors_allow_origin: Option>, + #[clap(long, env)] + ngrok: bool, + #[clap(long, env)] + ngrok_authtoken: Option, + #[clap(long, env)] + ngrok_edge: Option, + #[clap(long, env, default_value_t = false)] + messages_api_enabled: bool, + #[clap(long, env, default_value_t = false)] + disable_grammar_support: bool, + #[clap(default_value = "4", long, env)] + max_client_batch_size: usize, + #[clap(default_value = "on", long, env)] + usage_stats: usage_stats::UsageStatsLevel, } + +#[derive(Debug, Subcommand)] +enum Commands { + PrintSchema, +} + +#[tokio::main] +async fn main() -> Result<(), RouterError> { + // Get args + let args = Args::parse(); + // Pattern match configuration + let Args { + command, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + waiting_served_ratio, + max_batch_prefill_tokens, + max_batch_total_tokens, + max_waiting_tokens, + max_batch_size, + hostname, + port, + master_shard_uds_path, + tokenizer_name, + tokenizer_config_path, + revision, + validation_workers, + api_key, + json_output, + otlp_endpoint, + otlp_service_name, + cors_allow_origin, + ngrok, + ngrok_authtoken, + ngrok_edge, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + usage_stats, + } = args; + + if let Some(Commands::PrintSchema) = command { + use utoipa::OpenApi; + let api_doc = ApiDoc::openapi(); + let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); + println!("{}", api_doc); + std::process::exit(0); + }; + text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); + + // Validate args + if max_input_tokens >= max_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_input_tokens` must be < `max_total_tokens`".to_string(), + )); + } + if max_input_tokens as u32 > max_batch_prefill_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); + } + + if validation_workers == 0 { + return Err(RouterError::ArgumentValidation( + "`validation_workers` must be > 0".to_string(), + )); + } + + if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { + if max_batch_prefill_tokens > *max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); + } + if max_total_tokens as u32 > *max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. 
Given: {max_total_tokens} and {max_batch_total_tokens}"))); + } + } + + if let Some(max_batch_size) = max_batch_size { + if max_batch_size == 0 { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` must be > 0".to_string(), + )); + } + } + + let backend = LlamaCppBackend::new(); + + // Run server + server::run( + backend, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + validation_workers, + api_key, + tokenizer_name, + tokenizer_config_path, + revision, + hostname, + port, + cors_allow_origin, + ngrok, + ngrok_authtoken, + ngrok_edge, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + usage_stats, + ) + .await?; + Ok(()) +} + +#[derive(Debug, Error)] +enum RouterError { + #[error("Argument validation error: {0}")] + ArgumentValidation(String), + #[error("Backend failed: {0}")] + Backend(#[from] V3Error), + #[error("WebServer error: {0}")] + WebServer(#[from] server::WebServerError), + #[error("Tokio runtime failed to start: {0}")] + Tokio(#[from] std::io::Error), +} \ No newline at end of file diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 831372cdf99..80b2b4305af 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -18,6 +18,8 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) include(ExternalProject) +set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--allow-unsupported-compiler -ccbin=gcc") + option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") From e4432d36b1dbcdd53d614072cde4f08734e726b1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 18 Oct 2024 17:10:22 +0200 Subject: [PATCH 04/91] misc(cmake): add parameter to build specific cuda arch --- backends/llamacpp/CMakeLists.txt | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 4671314f3dc..890d99daa99 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -1,11 +1,12 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.24) project(tgi-llama-cpp-backend VERSION 1.0.0) -set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD 23) include(FetchContent) -set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") +set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against") +set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build") option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") @@ -13,18 +14,22 @@ option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama. 
include(cmake/fmt.cmake) include(cmake/spdlog.cmake) -if(${LLAMA_CPP_BUILD_CUDA}) +if (${LLAMA_CPP_BUILD_CUDA}) message(STATUS "Enabling llama.cpp CUDA support") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES ${LLAMA_CPP_TARGET_CUDA_ARCHS}) + endif () set(GGML_CUDA ON) -endif() +endif () # Download llama.cpp repo at the specific version fetchcontent_declare( - llama -# DOWNLOAD_EXTRACT_TIMESTAMP TRUE - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3837 - GIT_SHALLOW FALSE + llama + # DOWNLOAD_EXTRACT_TIMESTAMP TRUE + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git + GIT_TAG b3837 + GIT_SHALLOW FALSE ) fetchcontent_makeavailable(llama) @@ -33,10 +38,10 @@ add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) -if(${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) +if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl) -endif() +endif () From fa89d1e613c6f8971e14d84aba821d20984967cd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:14:35 +0200 Subject: [PATCH 05/91] misc(cmake): wut --- Cargo.lock | 21 +++++++++++++++++++++ LICENSE | 3 ++- backends/llamacpp/src/lib.rs | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72441430240..4075556bfef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4183,6 +4183,27 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "text-generation-backend-llamacpp" +version = "2.4.1-dev0" +dependencies = [ + "clap 4.5.20", + "cmake", + "cxx", + "cxx-build", + "hf-hub", + "image", + "metrics", + "metrics-exporter-prometheus", + "pkg-config", + "serde_json", + "text-generation-router", + "thiserror", + "tokenizers", + "tokio", + "tokio-stream", +] + [[package]] name = "text-generation-backends-trtllm" version = "2.4.1-dev0" diff --git a/LICENSE b/LICENSE index 7d0e80345c7..d6456956733 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -186,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2022 Hugging Face + Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index d4c3caf9a0c..bea7c06fc65 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,6 +1,6 @@ pub mod backend; -#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::llama::impl")] mod ffi { unsafe extern "C++" { include!("backends/llamacpp/csrc/backend.cpp"); From 05ad68467625ac9c1c6831b43bd4359454387820 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:14:51 +0200 Subject: [PATCH 06/91] feat(llamacpp): enable cuda --- backends/llamacpp/build.rs | 12 +++++++---- backends/llamacpp/csrc/backend.cpp | 33 +++++++++++++++++------------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 4e8859aba7e..26ea8d929b9 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -2,6 +2,7 @@ use cxx_build::CFG; use std::env; use std::path::PathBuf; +const CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS: &str = "75-real;80-real;86-real;89-real;90-real"; const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; const MPI_REQUIRED_VERSION: &str = "4.1"; @@ -20,6 +21,10 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf .map(|val| PathBuf::from(val)) .unwrap_or(out_dir.join("dist")); + let build_cuda = option_env!("LLAMA_CPP_BUILD_CUDA").unwrap_or("OFF"); + let cuda_archs = + option_env!("LLAMA_CPP_TARGET_CUDA_ARCHS").unwrap_or(CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS); + let _ = cmake::Config::new(".") .uses_cxx11() .generator("Ninja") @@ -29,9 +34,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf }) .env("OPT_LEVEL", opt_level) .define("CMAKE_INSTALL_PREFIX", &install_path) - // .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") - // .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) - // .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path) + .define("LLAMA_CPP_BUILD_CUDA", build_cuda) + .define("LLAMA_CPP_TARGET_CUDA_ARCHS", cuda_archs) .build(); // Additional transitive CMake dependencies @@ -61,7 +65,7 @@ fn build_ffi_layer(deps_folder: &PathBuf) { .include(deps_folder.join("llama-src").join("ggml").join("include")) .include(deps_folder.join("llama-src").join("include")) .file("csrc/backend.cpp") - .std("c++20") + .std("c++23") .compile(CMAKE_LLAMA_CPP_TARGET); println!("cargo:rerun-if-changed=CMakeLists.txt"); diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 875fdb684bf..38a94c8ac5d 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -10,14 +10,15 @@ namespace huggingface::tgi::backends::llama { - std::unique_ptr CreateLlamaCppBackend(std::string_view root) { + std::unique_ptr + CreateLlamaCppBackend(std::string_view root) { SPDLOG_INFO(FMT_STRING("Loading model from {}"), root); gpt_init(); // Fake argv std::vector args = {"tgi_llama_cpp_backend", "--model", root}; - std::vector argv; - for(const auto& arg : args) { + std::vector argv; + for (const auto &arg: args) { argv.push_back(const_cast(arg.data())); } argv.push_back(nullptr); @@ -39,35 +40,39 @@ namespace huggingface::tgi::backends::llama { auto loras = result.lora_adapters; // Make sure everything is correctly initialized - if(model == nullptr) + if (model == nullptr) throw std::runtime_error(fmt::format("Failed to load model from 
{}", root)); - return std::make_unique(model, context); + return std::make_unique(model, context); } - TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx), batch() - { + huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, + llama_context *const ctx) + : model(model), ctx(ctx), batch() { char modelName[128]; llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); } - TgiLlamaCppBackend::~TgiLlamaCppBackend() { - if(model) - { + huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() { + if (model) { SPDLOG_DEBUG("Freeing llama.cpp model"); llama_free_model(model); } - if(ctx) - { + if (ctx) { SPDLOG_DEBUG("Freeing llama.cpp context"); llama_free(ctx); } } - void TgiLlamaCppBackend::schedule() { + void huggingface::tgi::backends::llama::TgiLlamaCppBackend::schedule() { std::vector tokens; } + + namespace impl { + class LlamaCppBackendImpl { + + }; + } } \ No newline at end of file From 091107632068a8d9f24ff4505e4264015e5da101 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 22 Oct 2024 15:22:56 +0200 Subject: [PATCH 07/91] feat(backend): correctly load llama.cpp model from llama api and not gpt2 --- backends/llamacpp/csrc/backend.cpp | 54 ++++++++++++------------------ backends/llamacpp/csrc/backend.hpp | 8 ++++- backends/llamacpp/offline/main.cpp | 7 ++-- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 38a94c8ac5d..332bb4d5f0d 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -2,52 +2,40 @@ // Created by Morgan Funtowicz on 9/28/2024. 
// -#include -#include +#include +#include +#include +#include #include +#include #include #include "backend.hpp" namespace huggingface::tgi::backends::llama { - std::unique_ptr - CreateLlamaCppBackend(std::string_view root) { - SPDLOG_INFO(FMT_STRING("Loading model from {}"), root); - gpt_init(); + std::expected, TgiLlamaCppBackendError> + CreateLlamaCppBackend(const std::filesystem::path& modelPath) { + SPDLOG_INFO(FMT_STRING("Loading model from {}"), modelPath); + llama_backend_init(); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); - // Fake argv - std::vector args = {"tgi_llama_cpp_backend", "--model", root}; - std::vector argv; - for (const auto &arg: args) { - argv.push_back(const_cast(arg.data())); + // Load the model + if(!exists(modelPath)) { + return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); } - argv.push_back(nullptr); - // Create the GPT parameters - gpt_params params; - if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) { - throw std::runtime_error("Failed to create GPT Params from model"); - } - - - // Create the inference engine - SPDLOG_INFO("Allocating llama.cpp model from gpt_params"); - auto result = llama_init_from_gpt_params(params); - - // Unpack all the inference engine components - auto model = result.model; - auto context = result.context; - auto loras = result.lora_adapters; - - // Make sure everything is correctly initialized - if (model == nullptr) - throw std::runtime_error(fmt::format("Failed to load model from {}", root)); + auto params = llama_model_default_params(); + auto* model = llama_load_model_from_file(modelPath.c_str(), params); + auto* context = llama_new_context_with_model(model, { + .n_batch = 1, + .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, + .flash_attn = true, + }); return std::make_unique(model, context); } - huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, - llama_context *const ctx) + huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) : model(model), ctx(ctx), batch() { char modelName[128]; llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 7e3c9020c93..bcf728dbf33 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -4,12 +4,17 @@ #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#include #include #include namespace huggingface::tgi::backends::llama { // const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; + enum TgiLlamaCppBackendError { + MODEL_FILE_DOESNT_EXIST = 1 + }; + class TgiLlamaCppBackend { private: @@ -23,7 +28,8 @@ namespace huggingface::tgi::backends::llama { void schedule(); }; - std::unique_ptr CreateLlamaCppBackend(std::string_view root); + std::expected, TgiLlamaCppBackendError> + CreateLlamaCppBackend(const std::filesystem::path& root); } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 4009588d4d1..2f50cac1ef0 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include "../csrc/backend.hpp" @@ -16,7 +17,7 @@ int main(int argc, char** argv) { spdlog::set_level(spdlog::level::debug); - const std::string_view model_root = argv[1]; - 
auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root); - fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root); + const auto modelPath = absolute(std::filesystem::path(argv[1])); + if(auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); backend.has_value()) + fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", modelPath); } \ No newline at end of file From 098c66920d7e70e0221f0ebb34bec29f84b1cfe5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 22 Oct 2024 15:23:16 +0200 Subject: [PATCH 08/91] feat(backend): tell cmake to build llama-common and link to it --- backends/llamacpp/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 890d99daa99..9f08d0f3a0c 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,6 +6,7 @@ set(CMAKE_CXX_STANDARD 23) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against") +set(LLAMA_BUILD_COMMON ON) set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build") option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") @@ -28,7 +29,7 @@ fetchcontent_declare( llama # DOWNLOAD_EXTRACT_TIMESTAMP TRUE GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3837 + GIT_TAG b3958 GIT_SHALLOW FALSE ) @@ -41,7 +42,8 @@ target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl) + + target_link_libraries(tgi_llama_cpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) endif () From 45d5a6a8c5b21144cecf1db550822077148925c9 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 00:09:10 +0200 Subject: [PATCH 09/91] feat(backend): add some initial decoding steps --- backends/llamacpp/csrc/backend.cpp | 103 ++++++++++++++++++++++++----- backends/llamacpp/csrc/backend.hpp | 41 ++++++++++-- backends/llamacpp/offline/main.cpp | 26 ++++++-- 3 files changed, 146 insertions(+), 24 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 332bb4d5f0d..859041c20eb 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -2,20 +2,23 @@ // Created by Morgan Funtowicz on 9/28/2024. 
// -#include #include +#include + #include #include +#include #include #include #include + #include "backend.hpp" namespace huggingface::tgi::backends::llama { std::expected, TgiLlamaCppBackendError> CreateLlamaCppBackend(const std::filesystem::path& modelPath) { - SPDLOG_INFO(FMT_STRING("Loading model from {}"), modelPath); + SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); @@ -28,39 +31,109 @@ namespace huggingface::tgi::backends::llama { auto* model = llama_load_model_from_file(modelPath.c_str(), params); auto* context = llama_new_context_with_model(model, { .n_batch = 1, + .n_threads = 16, .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, - .flash_attn = true, + .flash_attn = false, }); return std::make_unique(model, context); } huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx), batch() { - char modelName[128]; - llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); + : model(model), ctx(ctx) { +#ifndef NDEBUG + char modelName[256]; + llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); +#endif } huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() { - if (model) { + if (ctx) { + SPDLOG_DEBUG("Freeing llama.cpp context"); + llama_free(ctx); + } + + if(model) { SPDLOG_DEBUG("Freeing llama.cpp model"); llama_free_model(model); } + } - if (ctx) { - SPDLOG_DEBUG("Freeing llama.cpp context"); - llama_free(ctx); + std::vector TgiLlamaCppBackend::Tokenize(const std::string &text) const { + std::vector tokens(llama_n_seq_max(ctx)); + + if(auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); nTokens < 0){ + tokens.resize(-nTokens); + llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); + } else { + tokens.resize(nTokens); } + + SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size()); + return tokens; } - void huggingface::tgi::backends::llama::TgiLlamaCppBackend::schedule() { - std::vector tokens; + std::unique_ptr TgiLlamaCppBackend::GetSamplerFromArgs( + const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, const uint64_t seed) { + auto *sampler = llama_sampler_chain_init({.no_perf = false}); + + // Penalties + llama_sampler_chain_add(sampler, llama_sampler_init_penalties( + llama_n_vocab(model), + llama_token_eos(model), + llama_token_nl (model), + 0.0f, + repetitionPenalty, + frequencyPenalty, + 0.0f, + false, + false + )); + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(topK))); + + if(0 < topP && topP < 1) { + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1)); + } + + llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); + return std::make_unique(sampler); } - namespace impl { - class LlamaCppBackendImpl { + std::vector huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( + std::span tokens, const uint32_t topK, const float_t topP, const uint32_t maxNewTokens) { + SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); + + // Allocate generation result + std::vector generated; + generated.reserve(llama_n_seq_max(ctx) - 
tokens.size()); + + // Retrieve decoding context + auto batch = llama_batch_get_one(const_cast(tokens.data()), static_cast(tokens.size())); + auto sampler = GetSamplerFromArgs(topK, topP, 1.0, 1.0, 2014); - }; + // Decode + for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { +#ifndef NDEBUG + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(ctx, batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); +#else + const auto status = llama_decode(ctx, batch); +#endif + if (status == LLAMA_SUCCESS) { + // Sample the new token + auto new_token_id = llama_sampler_sample(*sampler, ctx, -1); + generated.emplace_back(new_token_id); + generating = !llama_token_is_eog(model, new_token_id); + + // Next iteration + batch = llama_batch_get_one(&new_token_id, 1); + } + } + generated.shrink_to_fit(); + return generated; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index bcf728dbf33..e109a158cd7 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -4,28 +4,61 @@ #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#include +#include #include #include #include -namespace huggingface::tgi::backends::llama { -// const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; +#define LLAMA_SUCCESS 0 +namespace huggingface::tgi::backends::llama { enum TgiLlamaCppBackendError { MODEL_FILE_DOESNT_EXIST = 1 }; class TgiLlamaCppBackend { + using TokenId = int32_t; + private: llama_model* model; llama_context* ctx; - llama_batch batch; + + /** + * + * @param topK + * @param topP + * @return + */ + std::unique_ptr GetSamplerFromArgs( + uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed); + public: TgiLlamaCppBackend(llama_model *model, llama_context *ctx); ~TgiLlamaCppBackend(); - void schedule(); + /** + * + * @param text + * @return + */ + [[nodiscard]] std::vector Tokenize(const std::string& text) const; + + /** + * + * @param tokens + * @param topK + * @param topP + * @param maxNewTokens + * @return + */ + [[nodiscard]] std::vector Generate( + std::span tokens, + uint32_t topK, + float_t topP = 1.0f, + uint32_t maxNewTokens = std::numeric_limits::max() + ); }; std::expected, TgiLlamaCppBackendError> diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 2f50cac1ef0..3165261fa9f 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -3,21 +3,37 @@ // #include +#include #include #include -#include +#include #include #include "../csrc/backend.hpp" int main(int argc, char** argv) { - if(argc < 2) { + if (argc < 2) { fmt::print("No model folder provider"); return 1; } spdlog::set_level(spdlog::level::debug); + const auto prompt = "My name is Morgan"; + const auto modelPath = absolute(std::filesystem::path(argv[1])); - if(auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); backend.has_value()) - fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", modelPath); -} \ No newline at end of file + if (auto maybeBackend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { + // Retrieve the 
backend + const auto& backend = *maybeBackend; + + // Generate + const auto promptTokens = backend->Tokenize(prompt); + const auto out = backend->Generate(promptTokens, 30, 1.0, 32); + fmt::print(FMT_STRING("Generated: {}"), out); + } else { + switch (maybeBackend.error()) { + case huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesnt exist", modelPath); + return maybeBackend.error(); + } + } +} From 92bb1136533be3dc6fd99f1beba9986ec70efec2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 00:10:41 +0200 Subject: [PATCH 10/91] feat(backend): use llama_token as TokenId type --- backends/llamacpp/csrc/backend.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e109a158cd7..73bad99cb41 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -19,7 +19,7 @@ namespace huggingface::tgi::backends::llama { class TgiLlamaCppBackend { - using TokenId = int32_t; + using TokenId = llama_token; private: llama_model* model; From d4b5be10f9aeaea16da52ad60a459435c81d8444 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 14:12:32 +0200 Subject: [PATCH 11/91] feat(backend): minor refactor --- backends/llamacpp/csrc/backend.cpp | 3 +-- backends/llamacpp/csrc/backend.hpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 859041c20eb..f283b2ac7e9 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -123,7 +123,7 @@ namespace huggingface::tgi::backends::llama { #else const auto status = llama_decode(ctx, batch); #endif - if (status == LLAMA_SUCCESS) { + if (LLAMA_SUCCESS(status)) { // Sample the new token auto new_token_id = llama_sampler_sample(*sampler, ctx, -1); generated.emplace_back(new_token_id); @@ -133,7 +133,6 @@ namespace huggingface::tgi::backends::llama { batch = llama_batch_get_one(&new_token_id, 1); } } - generated.shrink_to_fit(); return generated; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 73bad99cb41..26d690c8321 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -10,7 +10,7 @@ #include #include -#define LLAMA_SUCCESS 0 +#define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llama { enum TgiLlamaCppBackendError { From 37faeb34b248bc3b6568539b6ae3a7a6d85f2c0d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 14:12:52 +0200 Subject: [PATCH 12/91] feat(backend): expose frequency and repetition penalties --- backends/llamacpp/csrc/backend.cpp | 13 ++++++++++--- backends/llamacpp/csrc/backend.hpp | 17 +++++++++++++---- backends/llamacpp/offline/main.cpp | 11 +++++++++-- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f283b2ac7e9..1f6dcfaefdf 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -100,8 +100,15 @@ namespace huggingface::tgi::backends::llama { return std::make_unique(sampler); } - std::vector huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( - std::span tokens, const uint32_t topK, const float_t topP, const uint32_t maxNewTokens) { + std::expected, TgiLlamaCppBackendError> 
huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( + std::span tokens, + const uint32_t topK, + const float_t topP, + const float_t frequencyPenalty, + const float_t repetitionPenalty, + const uint32_t maxNewTokens, + const uint64_t seed + ) { SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); // Allocate generation result @@ -110,7 +117,7 @@ namespace huggingface::tgi::backends::llama { // Retrieve decoding context auto batch = llama_batch_get_one(const_cast(tokens.data()), static_cast(tokens.size())); - auto sampler = GetSamplerFromArgs(topK, topP, 1.0, 1.0, 2014); + auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed); // Decode for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 26d690c8321..5f356bc06b6 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -13,7 +13,7 @@ #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llama { - enum TgiLlamaCppBackendError { + enum TgiLlamaCppBackendError: uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; @@ -43,24 +43,33 @@ namespace huggingface::tgi::backends::llama { * @param text * @return */ - [[nodiscard]] std::vector Tokenize(const std::string& text) const; + [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]] + std::vector Tokenize(const std::string& text) const; /** * * @param tokens * @param topK * @param topP + * @param frequencyPenalty + * @param repetitionPenalty * @param maxNewTokens + * @param seed * @return */ - [[nodiscard]] std::vector Generate( + [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] + std::expected, TgiLlamaCppBackendError> Generate( std::span tokens, uint32_t topK, float_t topP = 1.0f, - uint32_t maxNewTokens = std::numeric_limits::max() + float_t frequencyPenalty = 0.0f, + float_t repetitionPenalty = 0.0f, + uint32_t maxNewTokens = std::numeric_limits::max() - 1, + uint64_t seed = 2014 ); }; + [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]] std::expected, TgiLlamaCppBackendError> CreateLlamaCppBackend(const std::filesystem::path& root); } diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 3165261fa9f..c2ae05c726f 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -27,8 +27,15 @@ int main(int argc, char** argv) { // Generate const auto promptTokens = backend->Tokenize(prompt); - const auto out = backend->Generate(promptTokens, 30, 1.0, 32); - fmt::print(FMT_STRING("Generated: {}"), out); + const auto out = backend->Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); + + if(out.has_value()) + fmt::print(FMT_STRING("Generated: {}"), *out); + else { + const auto err = out.error(); + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); + } + } else { switch (maybeBackend.error()) { case huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: From f9c248657dbe3b418e97a3039a934d5aa628b777 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 22:11:58 +0200 Subject: [PATCH 13/91] chore(backend): minor formatting --- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git 
a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 1f6dcfaefdf..c8806957bb7 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -15,10 +15,10 @@ #include "backend.hpp" namespace huggingface::tgi::backends::llama { - std::expected, TgiLlamaCppBackendError> CreateLlamaCppBackend(const std::filesystem::path& modelPath) { SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); + llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 5f356bc06b6..e4c31ad6411 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -17,7 +17,6 @@ namespace huggingface::tgi::backends::llama { MODEL_FILE_DOESNT_EXIST = 1 }; - class TgiLlamaCppBackend { using TokenId = llama_token; From 355d8a55b46f4ac56a8741bc3e3960a6bed2c03a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 24 Oct 2024 09:56:40 +0200 Subject: [PATCH 14/91] feat(backend): wip Rust binding --- backends/llamacpp/CMakeLists.txt | 7 +++++++ backends/llamacpp/build.rs | 6 ++++-- backends/llamacpp/csrc/backend.hpp | 1 + backends/llamacpp/csrc/ffi.hpp | 19 +++++++++++++++++++ backends/llamacpp/src/backend.rs | 15 ++++++++++++++- backends/llamacpp/src/lib.rs | 9 +++++++-- 6 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 backends/llamacpp/csrc/ffi.hpp diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 9f08d0f3a0c..644db5ae162 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -11,6 +11,13 @@ set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + message(STATUS "Targeting libc++") + set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS}) +else() + message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}") +endif() + # Add dependencies include(cmake/fmt.cmake) include(cmake/spdlog.cmake) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 26ea8d929b9..d84e517f2c6 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -59,18 +59,20 @@ fn build_ffi_layer(deps_folder: &PathBuf) { CFG.include_prefix = "backends/llamacpp"; cxx_build::bridge("src/lib.rs") .static_flag(true) + .std("c++23") .include(deps_folder.join("fmt-src").join("include")) .include(deps_folder.join("spdlog-src").join("include")) .include(deps_folder.join("llama-src").join("common")) .include(deps_folder.join("llama-src").join("ggml").join("include")) .include(deps_folder.join("llama-src").join("include")) - .file("csrc/backend.cpp") - .std("c++23") + .include("csrc/backend.hpp") + .file("csrc/ffi.cpp") .compile(CMAKE_LLAMA_CPP_TARGET); println!("cargo:rerun-if-changed=CMakeLists.txt"); println!("cargo:rerun-if-changed=csrc/backend.hpp"); println!("cargo:rerun-if-changed=csrc/backend.cpp"); + println!("cargo:rerun-if-changed=csrc/ffi.hpp"); } fn main() { diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e4c31ad6411..7075642acd5 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include 
#define LLAMA_SUCCESS(x) x == 0 diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp new file mode 100644 index 00000000000..e924316e36e --- /dev/null +++ b/backends/llamacpp/csrc/ffi.hpp @@ -0,0 +1,19 @@ +// +// Created by mfuntowicz on 10/23/24. +// + +#ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP +#define TGI_LLAMA_CPP_BACKEND_FFI_HPP + +#include "backend.hpp" +//#include "backends/llamacpp/src/lib.rs.h" + + +namespace huggingface::tgi::backends::llama { + class LlamaCppBackendImpl { + + }; +} + + +#endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 8af1067b9d4..89daeee3658 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,8 +1,21 @@ +use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; +use cxx::UniquePtr; +use std::path::Path; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; use tokio_stream::wrappers::UnboundedReceiverStream; -pub struct TgiLlamaCppBakend {} +pub struct TgiLlamaCppBakend { + backend: UniquePtr, +} + +impl TgiLlamaCppBakend { + pub fn new>(model_path: P) -> Result { + Ok(Self { + backend: create_llamacpp_backend(model_path.as_ref().to_str().unwrap()), + }) + } +} impl Backend for TgiLlamaCppBakend { fn schedule( diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index bea7c06fc65..d25e3ca0bea 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,11 +1,16 @@ pub mod backend; -#[cxx::bridge(namespace = "huggingface::tgi::backends::llama::impl")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] mod ffi { unsafe extern "C++" { - include!("backends/llamacpp/csrc/backend.cpp"); + include!("backends/llamacpp/csrc/ffi.hpp"); /// Represent an instance of the llama.cpp backend instance on C++ side type LlamaCppBackendImpl; + + #[rust_name = "create_llamacpp_backend"] + fn CreateLlamaCppBackend( + engine_folder: &str, + ) -> UniquePtr; } } From e4d803c94ef8a48a172a57f783d2e5f9c9387edd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 24 Oct 2024 16:42:50 +0200 Subject: [PATCH 15/91] feat(backend): build and link through build.rs --- Cargo.lock | 90 ++++++++++++++++++++++++++++-- backends/llamacpp/CMakeLists.txt | 18 +++--- backends/llamacpp/Cargo.toml | 6 ++ backends/llamacpp/build.rs | 86 +++++++++++++++------------- backends/llamacpp/csrc/backend.cpp | 51 +++++++++-------- backends/llamacpp/csrc/backend.hpp | 22 ++++++-- backends/llamacpp/csrc/ffi.hpp | 34 ++++++++++- backends/llamacpp/offline/main.cpp | 6 +- backends/llamacpp/src/backend.rs | 59 ++++++++++++++++---- backends/llamacpp/src/lib.rs | 8 +-- backends/llamacpp/src/main.rs | 28 ++++++---- 11 files changed, 295 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4075556bfef..479e94d7fca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2732,6 +2732,20 @@ dependencies = [ "thiserror", ] +[[package]] +name = "opentelemetry" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + [[package]] name = "opentelemetry-otlp" version = "0.13.0" @@ -2849,6 +2863,24 @@ dependencies = [ "thiserror", ] +[[package]] +name = "opentelemetry_sdk" +version = "0.26.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" +dependencies = [ + "async-trait", + "futures-channel", + "futures-executor", + "futures-util", + "glob", + "once_cell", + "opentelemetry 0.26.0", + "percent-encoding", + "rand", + "thiserror", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4187,12 +4219,14 @@ dependencies = [ name = "text-generation-backend-llamacpp" version = "2.4.1-dev0" dependencies = [ + "async-trait", "clap 4.5.20", "cmake", "cxx", "cxx-build", "hf-hub", "image", + "log", "metrics", "metrics-exporter-prometheus", "pkg-config", @@ -4202,6 +4236,10 @@ dependencies = [ "tokenizers", "tokio", "tokio-stream", + "tracing", + "tracing-opentelemetry 0.27.0", + "tracing-subscriber", + "utoipa 5.1.2", ] [[package]] @@ -4330,7 +4368,7 @@ dependencies = [ "tracing-opentelemetry 0.21.0", "tracing-subscriber", "ureq", - "utoipa", + "utoipa 4.2.3", "utoipa-swagger-ui", "uuid", "vergen", @@ -4381,7 +4419,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa", + "utoipa 4.2.3", "utoipa-swagger-ui", ] @@ -4432,7 +4470,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa", + "utoipa 4.2.3", "utoipa-swagger-ui", ] @@ -4946,6 +4984,24 @@ dependencies = [ "web-time 1.1.0", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" +dependencies = [ + "js-sys", + "once_cell", + "opentelemetry 0.26.0", + "opentelemetry_sdk 0.26.0", + "smallvec", + "tracing", + "tracing-core", + "tracing-log 0.2.0", + "tracing-subscriber", + "web-time 1.1.0", +] + [[package]] name = "tracing-opentelemetry-instrumentation-sdk" version = "0.16.0" @@ -5136,7 +5192,19 @@ dependencies = [ "indexmap 2.6.0", "serde", "serde_json", - "utoipa-gen", + "utoipa-gen 4.3.0", +] + +[[package]] +name = "utoipa" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e12e84f0ff45b6818029cd0f67280e453c80132c1b9897df407ecc20b9f7cfd" +dependencies = [ + "indexmap 2.5.0", + "serde", + "serde_json", + "utoipa-gen 5.1.2", ] [[package]] @@ -5152,6 +5220,18 @@ dependencies = [ "syn 2.0.85", ] +[[package]] +name = "utoipa-gen" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dfc694d3a3118d2b9e80d68be83bf1aab7988510916934db83da61c14e7e6b2" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn 2.0.79", +] + [[package]] name = "utoipa-swagger-ui" version = "6.0.0" @@ -5164,7 +5244,7 @@ dependencies = [ "rust-embed", "serde", "serde_json", - "utoipa", + "utoipa 4.2.3", "zip", ] diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 644db5ae162..c4b6f0ce2ff 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -11,12 +11,12 @@ set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") -if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") message(STATUS "Targeting libc++") set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS}) 
-else() +else () message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}") -endif() +endif () # Add dependencies include(cmake/fmt.cmake) @@ -42,15 +42,17 @@ fetchcontent_declare( fetchcontent_makeavailable(llama) -add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) -target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) +add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) +target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) +target_link_libraries(tgi_llamacpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) + +install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") - add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) + add_executable(tgi_llama_cppoffline_runner offline/main.cpp) - target_link_libraries(tgi_llama_cpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) endif () diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index fdd980c308f..4a14dcdfd05 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -6,6 +6,7 @@ authors.workspace = true homepage.workspace = true [dependencies] +async-trait = "0.1" clap = { version = "4.5.19", features = ["derive"] } cxx = "1.0" hf-hub = { workspace = true } @@ -18,6 +19,11 @@ thiserror = "1.0.64" tokio = "1.40.0" tokio-stream = "0.1.16" tokenizers = { workspace = true } +tracing = "0.1" +tracing-opentelemetry = "0.27.0" +tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } +utoipa = { version = "5.1.2", features = ["axum_extras"] } +log = "0.4.22" [build-dependencies] cmake = "0.1" diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index d84e517f2c6..642a9665cc0 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -1,12 +1,14 @@ use cxx_build::CFG; use std::env; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; const CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS: &str = "75-real;80-real;86-real;89-real;90-real"; -const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; -const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; +const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llamacpp_backend_impl"; +const CMAKE_LLAMA_CPP_FFI_TARGET: &str = "tgi_llamacpp_backend"; const MPI_REQUIRED_VERSION: &str = "4.1"; +const BACKEND_DEPS: [&str; 2] = [CMAKE_LLAMA_CPP_TARGET, CMAKE_LLAMA_CPP_FFI_TARGET]; + macro_rules! probe { ($name: expr, $version: expr) => { if let Err(_) = pkg_config::probe_library($name) { @@ -16,11 +18,12 @@ macro_rules! 
probe { }; } -fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf { - let install_path = env::var("CMAKE_INSTALL_PREFIX") - .map(|val| PathBuf::from(val)) - .unwrap_or(out_dir.join("dist")); - +fn build_backend( + is_debug: bool, + opt_level: &str, + out_dir: &Path, + install_path: &PathBuf, +) -> PathBuf { let build_cuda = option_env!("LLAMA_CPP_BUILD_CUDA").unwrap_or("OFF"); let cuda_archs = option_env!("LLAMA_CPP_TARGET_CUDA_ARCHS").unwrap_or(CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS); @@ -38,41 +41,28 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf .define("LLAMA_CPP_TARGET_CUDA_ARCHS", cuda_archs) .build(); - // Additional transitive CMake dependencies - let deps_folder = out_dir.join("build").join("_deps"); - for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { - let dep_name = match is_debug { - true => format!("{}d", dependency), - false => String::from(dependency), - }; - let dep_path = deps_folder.join(format!("{}-build", dependency)); - println!("cargo:rustc-link-search={}", dep_path.display()); - println!("cargo:rustc-link-lib=static={}", dep_name); - } + let lib_path = install_path.join("lib64"); + println!("cargo:rustc-link-search=native={}", lib_path.display()); let deps_folder = out_dir.join("build").join("_deps"); deps_folder } -fn build_ffi_layer(deps_folder: &PathBuf) { - println!("cargo:warning={}", &deps_folder.display()); +fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { + println!("cargo:warning={}", deps_folder.display()); CFG.include_prefix = "backends/llamacpp"; cxx_build::bridge("src/lib.rs") .static_flag(true) .std("c++23") - .include(deps_folder.join("fmt-src").join("include")) - .include(deps_folder.join("spdlog-src").join("include")) - .include(deps_folder.join("llama-src").join("common")) - .include(deps_folder.join("llama-src").join("ggml").join("include")) - .include(deps_folder.join("llama-src").join("include")) - .include("csrc/backend.hpp") - .file("csrc/ffi.cpp") - .compile(CMAKE_LLAMA_CPP_TARGET); - - println!("cargo:rerun-if-changed=CMakeLists.txt"); - println!("cargo:rerun-if-changed=csrc/backend.hpp"); - println!("cargo:rerun-if-changed=csrc/backend.cpp"); - println!("cargo:rerun-if-changed=csrc/ffi.hpp"); + .include(deps_folder.join("spdlog-src").join("include")) // Why spdlog doesnt install headers? + // .include(deps_folder.join("fmt-src").join("include")) // Why spdlog doesnt install headers? + // .include(deps_folder.join("llama-src").join("include")) // Why spdlog doesnt install headers? + .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why spdlog doesnt install headers? + .include(deps_folder.join("llama-src").join("common").join("include")) // Why spdlog doesnt install headers? 
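The recurring "Why spdlog doesnt install headers?" comments come down to this: FetchContent checks each dependency out under the cmake crate's build tree, and the install(TARGETS ...) rule in CMakeLists.txt only copies the built libraries, so the headers never reach the install prefix and the cxx build has to point at the _deps/<name>-src checkouts directly. A tiny helper along these lines could factor out that lookup; dep_include is purely illustrative and not part of the patch (it reuses the Path/PathBuf imports already present in build.rs):

    fn dep_include(deps_folder: &Path, name: &str) -> PathBuf {
        // FetchContent places every source checkout under <build>/_deps/<name>-src
        deps_folder.join(format!("{}-src", name)).join("include")
    }

    // usage in build_ffi_layer: .include(dep_include(deps_folder, "spdlog"))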
+ .include(install_prefix.join("include")) + .include("csrc") + .file("csrc/ffi.hpp") + .compile(CMAKE_LLAMA_CPP_FFI_TARGET); } fn main() { @@ -84,17 +74,35 @@ fn main() { _ => (false, "3"), }; + let install_path = env::var("CMAKE_INSTALL_PREFIX") + .map(|val| PathBuf::from(val)) + .unwrap_or(out_dir.join("dist")); + // Build the backend - let deps_folder = build_backend(is_debug, opt_level, &out_dir); + let deps_path = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_folder); + build_ffi_layer(&deps_path, &install_path); // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); // Backend - // BACKEND_DEPS.iter().for_each(|name| { - // println!("cargo:rustc-link-lib=static={}", name); - // }); + BACKEND_DEPS.iter().for_each(|name| { + println!("cargo:rustc-link-lib=static={}", name); + }); + + // Linkage info + println!("cargo:rustc-link-search=native={}", out_dir.display()); + println!("cargo:rustc-link-lib=static=fmtd"); + println!("cargo:rustc-link-lib=static=spdlogd"); + println!("cargo:rustc-link-lib=static=common"); + println!("cargo:rustc-link-lib=dylib=ggml"); + println!("cargo:rustc-link-lib=dylib=llama"); + + // Rerun if one of these file change + println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=csrc/backend.hpp"); + println!("cargo:rerun-if-changed=csrc/backend.cpp"); + println!("cargo:rerun-if-changed=csrc/ffi.hpp"); } diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index c8806957bb7..ba4a02d5fc4 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -14,33 +14,35 @@ #include "backend.hpp" -namespace huggingface::tgi::backends::llama { - std::expected, TgiLlamaCppBackendError> - CreateLlamaCppBackend(const std::filesystem::path& modelPath) { +namespace huggingface::tgi::backends::llamacpp { + [[nodiscard]] + std::expected, TgiLlamaCppBackendError> + TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath) noexcept { SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); // Load the model - if(!exists(modelPath)) { + if (!exists(modelPath)) { return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); } auto params = llama_model_default_params(); - auto* model = llama_load_model_from_file(modelPath.c_str(), params); - auto* context = llama_new_context_with_model(model, { - .n_batch = 1, - .n_threads = 16, - .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, - .flash_attn = false, + auto *model = llama_load_model_from_file(modelPath.c_str(), params); + auto *context = llama_new_context_with_model(model, { + .n_batch = 1, + .n_threads = 16, + .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, + .flash_attn = false, }); - return std::make_unique(model, context); + return std::make_pair(model, context); } - huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx) { + huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, + llama_context *const ctx) + : model(model), ctx(ctx) { #ifndef NDEBUG char modelName[256]; llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); @@ -48,13 +50,13 @@ namespace huggingface::tgi::backends::llama { #endif } 
- huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() { + huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::~TgiLlamaCppBackend() { if (ctx) { SPDLOG_DEBUG("Freeing llama.cpp context"); llama_free(ctx); } - if(model) { + if (model) { SPDLOG_DEBUG("Freeing llama.cpp model"); llama_free_model(model); } @@ -63,7 +65,8 @@ namespace huggingface::tgi::backends::llama { std::vector TgiLlamaCppBackend::Tokenize(const std::string &text) const { std::vector tokens(llama_n_seq_max(ctx)); - if(auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); nTokens < 0){ + if (auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, + true); nTokens < 0) { tokens.resize(-nTokens); llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); } else { @@ -75,14 +78,15 @@ namespace huggingface::tgi::backends::llama { } std::unique_ptr TgiLlamaCppBackend::GetSamplerFromArgs( - const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, const uint64_t seed) { + const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, + const uint64_t seed) { auto *sampler = llama_sampler_chain_init({.no_perf = false}); // Penalties llama_sampler_chain_add(sampler, llama_sampler_init_penalties( llama_n_vocab(model), llama_token_eos(model), - llama_token_nl (model), + llama_token_nl(model), 0.0f, repetitionPenalty, frequencyPenalty, @@ -92,15 +96,16 @@ namespace huggingface::tgi::backends::llama { )); llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(topK))); - if(0 < topP && topP < 1) { + if (0 < topP && topP < 1) { llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1)); } llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); - return std::make_unique(sampler); + return std::make_unique(sampler); } - std::expected, TgiLlamaCppBackendError> huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( + std::expected, TgiLlamaCppBackendError> + huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::Generate( std::span tokens, const uint32_t topK, const float_t topP, @@ -108,7 +113,7 @@ namespace huggingface::tgi::backends::llama { const float_t repetitionPenalty, const uint32_t maxNewTokens, const uint64_t seed - ) { + ) { SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); // Allocate generation result @@ -120,7 +125,7 @@ namespace huggingface::tgi::backends::llama { auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed); // Decode - for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { + for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { #ifndef NDEBUG const auto start = std::chrono::steady_clock::now(); const auto status = llama_decode(ctx, batch); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 7075642acd5..7fa47e84d1c 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -9,12 +9,14 @@ #include #include #include +#include + #include #define LLAMA_SUCCESS(x) x == 0 -namespace huggingface::tgi::backends::llama { - enum TgiLlamaCppBackendError: uint8_t { +namespace huggingface::tgi::backends::llamacpp { + enum TgiLlamaCppBackendError : uint8_t { 
MODEL_FILE_DOESNT_EXIST = 1 }; @@ -22,8 +24,8 @@ namespace huggingface::tgi::backends::llama { using TokenId = llama_token; private: - llama_model* model; - llama_context* ctx; + llama_model *model; + llama_context *ctx; /** * @@ -35,7 +37,15 @@ namespace huggingface::tgi::backends::llama { uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed); public: + /** + * + * @return + */ + static std::expected, TgiLlamaCppBackendError> + FromGGUF(const std::filesystem::path &) noexcept; + TgiLlamaCppBackend(llama_model *model, llama_context *ctx); + ~TgiLlamaCppBackend(); /** @@ -44,7 +54,7 @@ namespace huggingface::tgi::backends::llama { * @return */ [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]] - std::vector Tokenize(const std::string& text) const; + std::vector Tokenize(const std::string &text) const; /** * @@ -71,7 +81,7 @@ namespace huggingface::tgi::backends::llama { [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]] std::expected, TgiLlamaCppBackendError> - CreateLlamaCppBackend(const std::filesystem::path& root); + CreateLlamaCppBackend(const std::filesystem::path &root); } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index e924316e36e..82f3f29651d 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -5,14 +5,44 @@ #ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP #define TGI_LLAMA_CPP_BACKEND_FFI_HPP +#include +#include +#include + +#include #include "backend.hpp" -//#include "backends/llamacpp/src/lib.rs.h" + +namespace huggingface::tgi::backends::llamacpp::impl { + class LlamaCppBackendImpl; +} + + +#include "backends/llamacpp/src/lib.rs.h" -namespace huggingface::tgi::backends::llama { +namespace huggingface::tgi::backends::llamacpp::impl { + + class LlamaCppBackendException : std::exception { + + }; + class LlamaCppBackendImpl { + private: + TgiLlamaCppBackend _inner; + public: + LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {} }; + + std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath) { + const auto cxxPath = std::string_view(modelPath); + if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath)); maybe.has_value()) { + auto [model, context] = *maybe; + return std::make_unique(model, context); + } else { + throw LlamaCppBackendException(); + } + } } diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index c2ae05c726f..56eb88c5464 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -10,6 +10,8 @@ #include #include "../csrc/backend.hpp" +using namespace huggingface::tgi::backends::llamacpp; + int main(int argc, char** argv) { if (argc < 2) { fmt::print("No model folder provider"); @@ -21,7 +23,7 @@ int main(int argc, char** argv) { const auto prompt = "My name is Morgan"; const auto modelPath = absolute(std::filesystem::path(argv[1])); - if (auto maybeBackend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { + if (auto maybeBackend = CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { // Retrieve the backend const auto& backend = *maybeBackend; @@ -38,7 +40,7 @@ int main(int argc, char** argv) { } else { switch (maybeBackend.error()) { - case huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: + case 
TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesnt exist", modelPath); return maybeBackend.error(); } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 89daeee3658..7b22e4a2d71 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,31 +1,66 @@ use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; +use async_trait::async_trait; use cxx::UniquePtr; -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::sync::Arc; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; +use thiserror::Error; +use tokio::task::spawn_blocking; use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::info; -pub struct TgiLlamaCppBakend { - backend: UniquePtr, +unsafe impl Send for LlamaCppBackendImpl {} + +#[derive(Debug, Error)] +pub enum LlamaCppBackendError { + #[error("Provided GGUF model path {0} doesn't exist")] + ModelFileDoesntExist(String), + + #[error("Failed to initialize model from GGUF file {0}: {1}")] + ModelInitializationFailed(PathBuf, String), } -impl TgiLlamaCppBakend { - pub fn new>(model_path: P) -> Result { - Ok(Self { - backend: create_llamacpp_backend(model_path.as_ref().to_str().unwrap()), - }) +pub struct LlamaCppBackend {} + +impl LlamaCppBackend { + pub fn new + Send>(model_path: P) -> Result { + let path = Arc::new(model_path.as_ref()); + if !path.exists() { + return Err(LlamaCppBackendError::ModelFileDoesntExist( + path.display().to_string(), + )); + } + + let mut backend = create_llamacpp_backend(path.to_str().unwrap()).map_err(|err| { + LlamaCppBackendError::ModelInitializationFailed( + path.to_path_buf(), + err.what().to_string(), + ) + })?; + + info!( + "Successfully initialized llama.cpp backend from {}", + path.display() + ); + + spawn_blocking(move || scheduler_loop(backend)); + Ok(Self {}) } } -impl Backend for TgiLlamaCppBakend { +async fn scheduler_loop(mut backend: UniquePtr) {} + +#[async_trait] +impl Backend for LlamaCppBackend { fn schedule( &self, - request: ValidGenerateRequest, + _request: ValidGenerateRequest, ) -> Result>, InferError> { Err(InferError::GenerationError("Not implemented yet".into())) } - async fn health(&self, current_health: bool) -> bool { - todo!() + async fn health(&self, _: bool) -> bool { + true } } diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index d25e3ca0bea..2bfc30654a6 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,6 +1,6 @@ pub mod backend; -#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp::impl")] mod ffi { unsafe extern "C++" { include!("backends/llamacpp/csrc/ffi.hpp"); @@ -9,8 +9,8 @@ mod ffi { type LlamaCppBackendImpl; #[rust_name = "create_llamacpp_backend"] - fn CreateLlamaCppBackend( - engine_folder: &str, - ) -> UniquePtr; + fn CreateLlamaCppBackendImpl( + modelPath: &str, + ) -> Result>; } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 7226473c70f..7420e16a518 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,7 +1,8 @@ use clap::{Parser, Subcommand}; +use std::path::PathBuf; +use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackendError}; use text_generation_router::{server, usage_stats}; use thiserror::Error; -use 
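One detail worth spelling out about the bridge in src/lib.rs above: because CreateLlamaCppBackendImpl is declared as returning Result<UniquePtr<LlamaCppBackendImpl>>, cxx wraps the call in a try/catch for exceptions deriving from std::exception and hands them to Rust as cxx::Exception, which is what lets backend.rs report err.what(). The LlamaCppBackendException introduced in ffi.hpp inherits privately (the class default), so a catch (const std::exception &) handler would not match it, and it carries no message. A message-carrying variant, shown only as a sketch and not part of the patch, would make those errors readable on the Rust side:

    #include <exception>
    #include <string>
    #include <utility>

    class LlamaCppBackendException : public std::exception {
        std::string reason_;
    public:
        explicit LlamaCppBackendException(std::string reason) : reason_(std::move(reason)) {}
        // cxx copies this message into the cxx::Exception surfaced to Rust
        const char *what() const noexcept override { return reason_.c_str(); }
    };

Throwing it with a formatted path, for example fmt::format("no GGUF model at {}", cxxPath), would then show up verbatim in LlamaCppBackendError::ModelInitializationFailed.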
text_generation_router::server::ApiDoc; /// App Configuration #[derive(Parser, Debug)] @@ -38,6 +39,8 @@ struct Args { port: u16, #[clap(default_value = "/tmp/text-generation-server-0", long, env)] master_shard_uds_path: String, + #[clap(long, env, help = "Path to GGUF model file(s) to load")] + gguf_path: PathBuf, #[clap(default_value = "bigscience/bloom", long, env)] tokenizer_name: String, #[clap(long, env)] @@ -98,6 +101,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, master_shard_uds_path, + gguf_path, tokenizer_name, tokenizer_config_path, revision, @@ -116,13 +120,13 @@ async fn main() -> Result<(), RouterError> { usage_stats, } = args; - if let Some(Commands::PrintSchema) = command { - use utoipa::OpenApi; - let api_doc = ApiDoc::openapi(); - let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); - println!("{}", api_doc); - std::process::exit(0); - }; + // if let Some(Commands::PrintSchema) = command { + // use utoipa::OpenApi; + // let api_doc = ApiDoc::openapi(); + // let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); + // println!("{}", api_doc); + // std::process::exit(0); + // }; text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); // Validate args @@ -158,7 +162,7 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(); + let backend = LlamaCppBackend::new(gguf_path)?; // Run server server::run( @@ -185,7 +189,7 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, usage_stats, ) - .await?; + .await?; Ok(()) } @@ -194,9 +198,9 @@ enum RouterError { #[error("Argument validation error: {0}")] ArgumentValidation(String), #[error("Backend failed: {0}")] - Backend(#[from] V3Error), + Backend(#[from] LlamaCppBackendError), #[error("WebServer error: {0}")] WebServer(#[from] server::WebServerError), #[error("Tokio runtime failed to start: {0}")] Tokio(#[from] std::io::Error), -} \ No newline at end of file +} From f0859c247f0dadbfe6d26920221ba270bb99f258 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 25 Oct 2024 07:27:12 +0200 Subject: [PATCH 16/91] misc(build): handle different lib destination folder lib/lib64 --- backends/llamacpp/build.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 642a9665cc0..6d6bd514957 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -41,7 +41,12 @@ fn build_backend( .define("LLAMA_CPP_TARGET_CUDA_ARCHS", cuda_archs) .build(); - let lib_path = install_path.join("lib64"); + // On some x64 and ARM mainly the lib install destination is "lib" and not "lib64" + let lib_path = if install_path.join("lib64").exists() { + install_path.join("lib64") + } else { + install_path.join("lib") + }; println!("cargo:rustc-link-search=native={}", lib_path.display()); let deps_folder = out_dir.join("build").join("_deps"); @@ -55,14 +60,12 @@ fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { .static_flag(true) .std("c++23") .include(deps_folder.join("spdlog-src").join("include")) // Why spdlog doesnt install headers? - // .include(deps_folder.join("fmt-src").join("include")) // Why spdlog doesnt install headers? - // .include(deps_folder.join("llama-src").join("include")) // Why spdlog doesnt install headers? - .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why spdlog doesnt install headers? 
- .include(deps_folder.join("llama-src").join("common").join("include")) // Why spdlog doesnt install headers? + .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why ggml doesnt install headers? + .include(deps_folder.join("llama-src").join("common").join("include")) // Why common doesnt install headers? .include(install_prefix.join("include")) .include("csrc") .file("csrc/ffi.hpp") - .compile(CMAKE_LLAMA_CPP_FFI_TARGET); + .compile(CMAKE_LLAMA_CPP_FFI_TARGET); // Make sure this target is not the same as cmake above } fn main() { @@ -94,8 +97,15 @@ fn main() { // Linkage info println!("cargo:rustc-link-search=native={}", out_dir.display()); - println!("cargo:rustc-link-lib=static=fmtd"); - println!("cargo:rustc-link-lib=static=spdlogd"); + + if is_debug { + println!("cargo:rustc-link-lib=static=fmtd"); + println!("cargo:rustc-link-lib=static=spdlogd"); + } else { + println!("cargo:rustc-link-lib=static=fmt"); + println!("cargo:rustc-link-lib=static=spdlog"); + } + println!("cargo:rustc-link-lib=static=common"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); From 179309b364ebafc4a3da0179f03fee7a5277799a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 25 Oct 2024 08:02:45 +0200 Subject: [PATCH 17/91] misc(build): refactor build type detection in cmake --- backends/llamacpp/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index c4b6f0ce2ff..adcc6af29dc 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -45,12 +45,15 @@ fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) target_link_libraries(tgi_llamacpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) - install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + target_compile_definitions(tgi_llamacpp_backend_impl PRIVATE TGI_LLAMACPP_BACKEND_DEBUG=1) +endif () + if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") - add_executable(tgi_llama_cppoffline_runner offline/main.cpp) + add_executable(tgi_llamacpp_offline_runner offline/main.cpp) target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) endif () From a316c532550a76485fcf152cbb911144b0b80231 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 25 Oct 2024 08:11:42 +0200 Subject: [PATCH 18/91] feat(llamacpp): expose number of threads for the backend when constructing the model --- backends/llamacpp/csrc/backend.cpp | 12 ++++++++---- backends/llamacpp/csrc/backend.hpp | 2 +- backends/llamacpp/csrc/ffi.hpp | 4 ++-- backends/llamacpp/src/backend.rs | 18 +++++++++++------- backends/llamacpp/src/lib.rs | 1 + backends/llamacpp/src/main.rs | 19 +++++++++---------- 6 files changed, 32 insertions(+), 24 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index ba4a02d5fc4..907fe58e688 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -17,12 +17,16 @@ namespace huggingface::tgi::backends::llamacpp { [[nodiscard]] std::expected, TgiLlamaCppBackendError> - TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath) noexcept { + TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath, const uint16_t 
nThreads) noexcept { SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + llama_print_system_info(); +#endif + // Load the model if (!exists(modelPath)) { return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); @@ -32,7 +36,7 @@ namespace huggingface::tgi::backends::llamacpp { auto *model = llama_load_model_from_file(modelPath.c_str(), params); auto *context = llama_new_context_with_model(model, { .n_batch = 1, - .n_threads = 16, + .n_threads = nThreads, .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, .flash_attn = false, }); @@ -43,7 +47,7 @@ namespace huggingface::tgi::backends::llamacpp { huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) : model(model), ctx(ctx) { -#ifndef NDEBUG +#ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); @@ -126,7 +130,7 @@ namespace huggingface::tgi::backends::llamacpp { // Decode for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { -#ifndef NDEBUG +#ifdef TGI_LLAMACPP_BACKEND_DEBUG const auto start = std::chrono::steady_clock::now(); const auto status = llama_decode(ctx, batch); const auto end = std::chrono::steady_clock::now(); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 7fa47e84d1c..24b49949612 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -42,7 +42,7 @@ namespace huggingface::tgi::backends::llamacpp { * @return */ static std::expected, TgiLlamaCppBackendError> - FromGGUF(const std::filesystem::path &) noexcept; + FromGGUF(const std::filesystem::path &, uint16_t) noexcept; TgiLlamaCppBackend(llama_model *model, llama_context *ctx); diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 82f3f29651d..09d8af2d9bd 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -34,9 +34,9 @@ namespace huggingface::tgi::backends::llamacpp::impl { LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {} }; - std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath) { + std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) { const auto cxxPath = std::string_view(modelPath); - if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath)); maybe.has_value()) { + if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath), nThreads); maybe.has_value()) { auto [model, context] = *maybe; return std::make_unique(model, context); } else { diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 7b22e4a2d71..0693ed34a8b 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -24,7 +24,10 @@ pub enum LlamaCppBackendError { pub struct LlamaCppBackend {} impl LlamaCppBackend { - pub fn new + Send>(model_path: P) -> Result { + pub fn new + Send>( + model_path: P, + n_threads: u16, + ) -> Result { let path = Arc::new(model_path.as_ref()); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( @@ -32,12 +35,13 @@ impl LlamaCppBackend { )); } - let mut backend = 
create_llamacpp_backend(path.to_str().unwrap()).map_err(|err| { - LlamaCppBackendError::ModelInitializationFailed( - path.to_path_buf(), - err.what().to_string(), - ) - })?; + let mut backend = + create_llamacpp_backend(path.to_str().unwrap(), n_threads).map_err(|err| { + LlamaCppBackendError::ModelInitializationFailed( + path.to_path_buf(), + err.what().to_string(), + ) + })?; info!( "Successfully initialized llama.cpp backend from {}", diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 2bfc30654a6..673fe130255 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -11,6 +11,7 @@ mod ffi { #[rust_name = "create_llamacpp_backend"] fn CreateLlamaCppBackendImpl( modelPath: &str, + n_threads: u16, ) -> Result>; } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 7420e16a518..3920da21d9d 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -23,24 +23,25 @@ struct Args { max_input_tokens: usize, #[clap(default_value = "2048", long, env)] max_total_tokens: usize, - #[clap(default_value = "1.2", long, env)] - waiting_served_ratio: f32, #[clap(default_value = "4096", long, env)] max_batch_prefill_tokens: u32, #[clap(long, env)] max_batch_total_tokens: Option, - #[clap(default_value = "20", long, env)] - max_waiting_tokens: usize, #[clap(long, env)] max_batch_size: Option, #[clap(default_value = "0.0.0.0", long, env)] hostname: String, #[clap(default_value = "3000", long, short, env)] port: u16, - #[clap(default_value = "/tmp/text-generation-server-0", long, env)] - master_shard_uds_path: String, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, + #[clap( + long, + env, + default_value = "1", + help = "Number of CPU threads allocated to one llama.cpp model" + )] + cores_per_instance: u16, #[clap(default_value = "bigscience/bloom", long, env)] tokenizer_name: String, #[clap(long, env)] @@ -93,15 +94,13 @@ async fn main() -> Result<(), RouterError> { max_top_n_tokens, max_input_tokens, max_total_tokens, - waiting_served_ratio, max_batch_prefill_tokens, max_batch_total_tokens, - max_waiting_tokens, max_batch_size, hostname, port, - master_shard_uds_path, gguf_path, + cores_per_instance, tokenizer_name, tokenizer_config_path, revision, @@ -162,7 +161,7 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(gguf_path)?; + let backend = LlamaCppBackend::new(gguf_path, cores_per_instance)?; // Run server server::run( From 0c1dd0ed2b3d38dfaa2aa5409b39c7b73eca9493 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 29 Oct 2024 22:30:36 +0100 Subject: [PATCH 19/91] feat(llamacpp): wip explosion --- backends/llamacpp/csrc/backend.cpp | 172 ++++++++++------------------- backends/llamacpp/csrc/backend.hpp | 95 +++++++++------- backends/llamacpp/csrc/ffi.hpp | 4 +- 3 files changed, 116 insertions(+), 155 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 907fe58e688..080a4401409 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -15,140 +15,86 @@ #include "backend.hpp" namespace huggingface::tgi::backends::llamacpp { - [[nodiscard]] - std::expected, TgiLlamaCppBackendError> - TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath, const uint16_t nThreads) noexcept { - SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); - llama_backend_init(); - llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); + 
std::unique_ptr SamplingParams::IntoLlamaSampler(const llama_model *pModel) const { + auto *pSampler = llama_sampler_chain_init({.no_perf = false}); -#ifdef TGI_LLAMACPP_BACKEND_DEBUG - llama_print_system_info(); -#endif + // Penalties + llama_sampler_chain_add(pSampler, llama_sampler_init_penalties( + llama_n_vocab(pModel), + llama_token_eos(pModel), + llama_token_nl(pModel), + 0.0f, + repetitionPenalty, + frequencyPenalty, + 0.0f, + false, + false + )); + llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(topK))); - // Load the model - if (!exists(modelPath)) { - return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); + if (0 < topP && topP < 1) { + llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(topP, 1)); } - auto params = llama_model_default_params(); - auto *model = llama_load_model_from_file(modelPath.c_str(), params); - auto *context = llama_new_context_with_model(model, { - .n_batch = 1, - .n_threads = nThreads, - .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, - .flash_attn = false, - }); - - return std::make_pair(model, context); + llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); + return std::unique_ptr(pSampler); } - huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, - llama_context *const ctx) - : model(model), ctx(ctx) { + Worker::Worker(std::shared_ptr pModel, const llama_context_params ¶ms) + : mModel_(pModel), mParams_(params) { + #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; - llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); + llama_model_meta_val_str(pModel.get(), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } - huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::~TgiLlamaCppBackend() { - if (ctx) { - SPDLOG_DEBUG("Freeing llama.cpp context"); - llama_free(ctx); - } - - if (model) { - SPDLOG_DEBUG("Freeing llama.cpp model"); - llama_free_model(model); - } - } + void Worker::Loop(std::atomic_flag &running, std::atomic_uint8_t &waiting, std::queue &backlog) { + auto *context = llama_new_context_with_model(mModel_.get(), mParams_); - std::vector TgiLlamaCppBackend::Tokenize(const std::string &text) const { - std::vector tokens(llama_n_seq_max(ctx)); + while (running.test(std::memory_order_acquire)) { + if (waiting.load(std::memory_order_acquire) > 0) { + --waiting; - if (auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, - true); nTokens < 0) { - tokens.resize(-nTokens); - llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); - } else { - tokens.resize(nTokens); - } + auto request = backlog.front(); + auto sampler = request.IntoLlamaSampler(mModel_.get()); - SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size()); - return tokens; - } + // Retrieve decoding context + auto batch = llama_batch_get_one(tokens.data(), tokens.size()); + // Decode + for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < 1; ++nDecoded) { +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(context, batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} 
token(s) in {}"), batch.n_tokens, latency); +#else + const auto status = llama_decode(ctx, batch); +#endif + if (LLAMA_SUCCESS(status)) { + // Sample the new token + auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + generated.emplace_back(new_token_id); + generating = !llama_token_is_eog(mModel_.get(), new_token_id); - std::unique_ptr TgiLlamaCppBackend::GetSamplerFromArgs( - const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, - const uint64_t seed) { - auto *sampler = llama_sampler_chain_init({.no_perf = false}); + // Next iteration + batch = llama_batch_get_one(&new_token_id, 1); + } + } - // Penalties - llama_sampler_chain_add(sampler, llama_sampler_init_penalties( - llama_n_vocab(model), - llama_token_eos(model), - llama_token_nl(model), - 0.0f, - repetitionPenalty, - frequencyPenalty, - 0.0f, - false, - false - )); - llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(topK))); + backlog.pop(); - if (0 < topP && topP < 1) { - llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1)); + } } - llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); - return std::make_unique(sampler); + llama_free(context); } - std::expected, TgiLlamaCppBackendError> - huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::Generate( - std::span tokens, - const uint32_t topK, - const float_t topP, - const float_t frequencyPenalty, - const float_t repetitionPenalty, - const uint32_t maxNewTokens, - const uint64_t seed - ) { - SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); - - // Allocate generation result - std::vector generated; - generated.reserve(llama_n_seq_max(ctx) - tokens.size()); - - // Retrieve decoding context - auto batch = llama_batch_get_one(const_cast(tokens.data()), static_cast(tokens.size())); - auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed); - - // Decode - for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { -#ifdef TGI_LLAMACPP_BACKEND_DEBUG - const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(ctx, batch); - const auto end = std::chrono::steady_clock::now(); - const auto latency = std::chrono::duration_cast(end - start); - SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); -#else - const auto status = llama_decode(ctx, batch); -#endif - if (LLAMA_SUCCESS(status)) { - // Sample the new token - auto new_token_id = llama_sampler_sample(*sampler, ctx, -1); - generated.emplace_back(new_token_id); - generating = !llama_token_is_eog(model, new_token_id); - - // Next iteration - batch = llama_batch_get_one(&new_token_id, 1); - } - } - return generated; - } + huggingface::tgi::backends::llamacpp::BackendBase::BackendBase(llama_model *model) + : mModel_(model, llama_free_model) { llama_backend_init(); } + + BackendBase::~BackendBase() { llama_backend_free(); } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 24b49949612..e4814d45689 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -4,9 +4,11 @@ #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#include #include #include #include +#include #include #include #include @@ -16,72 +18,85 @@ #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llamacpp { - enum 
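The TODO above concerns the fixed 2 << 8 (512) element scratch buffer allocated just below before delegating to the span-based generate. One possible answer, sketched here purely as an illustration and not part of the patch, is to size the buffer from the request's max_new_tokens, clamped so that a defaulted generation_params_t (whose max_new_tokens is the uint32_t maximum) cannot trigger a huge allocation:

    // Illustrative sizing only; 512 mirrors the current 2 << 8 fallback.
    const size_t budget = generation_params.max_new_tokens < 512
                              ? generation_params.max_new_tokens
                              : 512;
    auto generated = std::vector<llama_token>(budget);

The rest of the overload stays as written: the vector is resized down to the number of tokens actually produced once the worker returns.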
TgiLlamaCppBackendError : uint8_t { + enum BackendError : uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; - class TgiLlamaCppBackend { - using TokenId = llama_token; - - private: - llama_model *model; - llama_context *ctx; + struct SamplingParams { + uint32_t topK = std::numeric_limits::max(); + float_t topP = 1.0f; + float_t frequencyPenalty = 0.0f; + float_t repetitionPenalty = 0.0f; + uint64_t seed = 2014; /** - * - * @param topK - * @param topP + * Convert this GenerationParams to the respective llama_sampler structure + * @param Pointer to the model data * @return */ - std::unique_ptr GetSamplerFromArgs( - uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed); + std::unique_ptr IntoLlamaSampler(const llama_model *) const; + }; + + class Worker { + protected: + constexpr static auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; public: - /** - * - * @return - */ - static std::expected, TgiLlamaCppBackendError> - FromGGUF(const std::filesystem::path &, uint16_t) noexcept; + using model_ptr_type = std::shared_ptr; + using context_params_type = llama_context_params; + using token_id_type = llama_token; + + private: + const model_ptr_type mModel_; + context_params_type mParams_; - TgiLlamaCppBackend(llama_model *model, llama_context *ctx); + public: + Worker(std::shared_ptr pModel, const llama_context_params ¶ms); - ~TgiLlamaCppBackend(); + void Loop(std::atomic_flag &, std::atomic_uint8_t &, std::queue &) const; + }; + + + class BackendBase { + + private: + std::shared_ptr mModel_; + + public: + explicit BackendBase(llama_model *model); + + ~BackendBase(); /** * - * @param text + * @param tokens + * @params out + * @param params + * @param maxNewTokens * @return */ - [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]] - std::vector Tokenize(const std::string &text) const; + [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] + std::expected, BackendError> Generate( + std::span tokens, + std::span out, + const SamplingParams ¶ms, + uint32_t maxNewTokens = std::numeric_limits::max() - 1 + ); /** * * @param tokens - * @param topK - * @param topP - * @param frequencyPenalty - * @param repetitionPenalty + * @param params * @param maxNewTokens - * @param seed * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, TgiLlamaCppBackendError> Generate( - std::span tokens, - uint32_t topK, - float_t topP = 1.0f, - float_t frequencyPenalty = 0.0f, - float_t repetitionPenalty = 0.0f, - uint32_t maxNewTokens = std::numeric_limits::max() - 1, - uint64_t seed = 2014 + std::expected, BackendError> Generate( + std::span tokens, + const SamplingParams ¶ms, + uint32_t maxNewTokens = std::numeric_limits::max() - 1 ); }; - - [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]] - std::expected, TgiLlamaCppBackendError> - CreateLlamaCppBackend(const std::filesystem::path &root); } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 09d8af2d9bd..d15728b9a02 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -28,10 +28,10 @@ namespace huggingface::tgi::backends::llamacpp::impl { class LlamaCppBackendImpl { private: - TgiLlamaCppBackend _inner; + BackendBase _inner; public: - LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {} + 
LlamaCppBackendImpl(llama_model *model) : _inner(model) {} }; std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) { From dbc5b7a0f7defc463ca43ec6eeae43e0a1f2182b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 26 Oct 2024 22:24:05 +0200 Subject: [PATCH 20/91] misc(offline): link correctly --- backends/llamacpp/offline/main.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 56eb88c5464..d8121d3df37 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -2,7 +2,6 @@ // Created by mfuntowicz on 10/3/24. // -#include #include #include #include @@ -12,7 +11,7 @@ using namespace huggingface::tgi::backends::llamacpp; -int main(int argc, char** argv) { +int main(int argc, char **argv) { if (argc < 2) { fmt::print("No model folder provider"); return 1; @@ -23,15 +22,16 @@ int main(int argc, char** argv) { const auto prompt = "My name is Morgan"; const auto modelPath = absolute(std::filesystem::path(argv[1])); - if (auto maybeBackend = CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { + if (auto maybeBackend = TgiLlamaCppBackend::FromGGUF(modelPath); maybeBackend.has_value()) { // Retrieve the backend - const auto& backend = *maybeBackend; + auto [model, context] = *maybeBackend; + auto backend = TgiLlamaCppBackend(model, context); // Generate - const auto promptTokens = backend->Tokenize(prompt); - const auto out = backend->Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); + const auto promptTokens = backend.Tokenize(prompt); + const auto out = backend.Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); - if(out.has_value()) + if (out.has_value()) fmt::print(FMT_STRING("Generated: {}"), *out); else { const auto err = out.error(); From 611590440dd0b0bf23d04ba5604ecead8ef509b3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 28 Oct 2024 22:44:47 +0100 Subject: [PATCH 21/91] misc(offline): expose more parameters for generate --- backends/llamacpp/src/backend.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 0693ed34a8b..af50470d2a3 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,12 +1,13 @@ use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; use async_trait::async_trait; -use cxx::UniquePtr; +use cxx::{Exception, UniquePtr}; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::Arc; +use std::thread::spawn; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; use thiserror::Error; -use tokio::task::spawn_blocking; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::info; @@ -48,12 +49,27 @@ impl LlamaCppBackend { path.display() ); - spawn_blocking(move || scheduler_loop(backend)); + let j = spawn(|| scheduler_loop(backend)); + j.join().ok(); Ok(Self {}) } } -async fn scheduler_loop(mut backend: UniquePtr) {} +fn scheduler_loop(mut backend: UniquePtr) { + println!("Scheduler loop"); + let tokens = [128000i32, 5159, 836, 374, 23809]; + let mut generated = vec![0i32; 128]; + match backend + .pin_mut() + .generate(&tokens, &mut generated, 40, 32, 1.0, 1.0, 1.0, 1.0, 2014) + { + Ok(n_tokens) => { + generated.truncate(n_tokens); + println!("Generated {} tokens -> {:?}", n_tokens, generated); + } + Err(err) => 
println!("Error: {}", err), + } +} #[async_trait] impl Backend for LlamaCppBackend { From b98c635781e365a9669eca8ba7f2b770b4893855 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 30 Oct 2024 22:40:37 +0100 Subject: [PATCH 22/91] feat(backend): entirely rewrite backend --- backends/llamacpp/csrc/backend.cpp | 155 +++++++++++++++++++++-------- backends/llamacpp/csrc/backend.hpp | 148 +++++++++++++++++++++------ 2 files changed, 230 insertions(+), 73 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 080a4401409..daf8de54484 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -16,85 +16,156 @@ namespace huggingface::tgi::backends::llamacpp { - std::unique_ptr SamplingParams::IntoLlamaSampler(const llama_model *pModel) const { + void llama_batch_fill_prompt(llama_batch &batch, std::span input_tokens) { + for (auto i = 0; i < input_tokens.size(); ++i) { + batch.token[i] = input_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i] = 0; + batch.logits[i] = false; + ++batch.n_tokens; + } + + batch.logits[batch.n_tokens] = true; + } + + std::unique_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { auto *pSampler = llama_sampler_chain_init({.no_perf = false}); // Penalties llama_sampler_chain_add(pSampler, llama_sampler_init_penalties( - llama_n_vocab(pModel), - llama_token_eos(pModel), - llama_token_nl(pModel), + llama_n_vocab(model), + llama_token_eos(model), + llama_token_nl(model), 0.0f, - repetitionPenalty, - frequencyPenalty, + repetition_penalty, + frequency_penalty, 0.0f, false, false )); - llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(topK))); + llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); - if (0 < topP && topP < 1) { - llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(topP, 1)); + if (0 < top_p && top_p < 1) { + llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); } llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); return std::unique_ptr(pSampler); } - Worker::Worker(std::shared_ptr pModel, const llama_context_params ¶ms) - : mModel_(pModel), mParams_(params) { + worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) + : mModel_(model), mParams_(params) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; - llama_model_meta_val_str(pModel.get(), "general.name", modelName, sizeof(modelName)); + llama_model_meta_val_str(model.get(), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } - void Worker::Loop(std::atomic_flag &running, std::atomic_uint8_t &waiting, std::queue &backlog) { + void worker_t::loop(std::stop_source &driver, std::queue &backlog) const { auto *context = llama_new_context_with_model(mModel_.get(), mParams_); - while (running.test(std::memory_order_acquire)) { - if (waiting.load(std::memory_order_acquire) > 0) { - --waiting; + while (!driver.stop_requested()) { + const auto generation_context = backlog.front(); + + generate(context, generation_context, std::nullopt); + backlog.pop(); + + SPDLOG_DEBUG("Processed request ({:d} remaining)", backlog.size()); + } + + llama_free(context); + } + + size_t worker_t::generate( + llama_context *context, + const generation_context_t &generation_context, + const std::optional &callback) const { + // Store information about context and generation size + auto 
prompt_length = std::ssize(generation_context.input_tokens); + auto max_new_tokens = generation_context.generation_params.max_new_tokens; + + // Convert sampling params to what llama.cpp is looking for + auto sampler = generation_context.sampling_params.into_llama_sampler(mModel_.get()); - auto request = backlog.front(); - auto sampler = request.IntoLlamaSampler(mModel_.get()); + // Setup the prompt + auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); + auto batch = llama_batch_get_one(copy.data(), copy.size()); + + // Decode + auto n_decoded_tokens = 0; + for (bool generating = true; generating && n_decoded_tokens < max_new_tokens; ++n_decoded_tokens) { + const auto callback_ = callback.value_or(llama_void_callback); - // Retrieve decoding context - auto batch = llama_batch_get_one(tokens.data(), tokens.size()); - // Decode - for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < 1; ++nDecoded) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG - const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(context, batch); - const auto end = std::chrono::steady_clock::now(); - const auto latency = std::chrono::duration_cast(end - start); - SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(context, batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(ctx, batch); + const auto status = llama_decode(ctx, batch); #endif - if (LLAMA_SUCCESS(status)) { - // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); - generated.emplace_back(new_token_id); - generating = !llama_token_is_eog(mModel_.get(), new_token_id); + batch.n_tokens = 0; + if (LLAMA_SUCCESS(status)) { + // Sample the new token + auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); - // Next iteration - batch = llama_batch_get_one(&new_token_id, 1); - } - } + generation_context.generated_tokens[n_decoded_tokens] = new_token_id; + generating = !is_eos; - backlog.pop(); + // Bubble up the generated token if a callback is provided + std::invoke(std::forward(callback_), new_token_id, is_eos); + batch = llama_batch_get_one(&new_token_id, 1); } } - llama_free(context); + return n_decoded_tokens; + } + + + backend_base_t::backend_base_t(llama_model *model) : mModel_(model, llama_free_model) { llama_backend_init(); } + + backend_base_t::~backend_base_t() { llama_backend_free(); } + + std::expected, backend_error_t> backend_base_t::generate( + std::span tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) { + // TODO: Should we provide a way to change this value? 
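// A minimal sketch of one way to resolve the TODO above, shown for illustration only and not part of
// this patch: size the scratch buffer from the request's max_new_tokens and keep the fixed
// 2 << 8 (= 512) entries only as a fallback for "unbounded" requests (max_new_tokens left at its
// numeric_limits default). Assumes <limits> is available, as backend.hpp already relies on it.
//
//     const auto n_out = (generation_params.max_new_tokens == std::numeric_limits<uint32_t>::max())
//         ? static_cast<size_t>(2 << 8)                                  // current fixed fallback
//         : static_cast<size_t>(generation_params.max_new_tokens);       // honor the request
//     auto generated = std::vector<llama_token>(n_out);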
+ auto generated = std::vector(2 << 8); + + auto nTokensGenerated = generate(tokens, generated, generation_params, sampling_params, callback); + if (nTokensGenerated.has_value()) + generated.resize(*nTokensGenerated); + return generated; } - huggingface::tgi::backends::llamacpp::BackendBase::BackendBase(llama_model *model) - : mModel_(model, llama_free_model) { llama_backend_init(); } - BackendBase::~BackendBase() { llama_backend_free(); } + /** Single worker_t Backend impl **/ + + single_worker_backend_t::single_worker_backend_t(llama_model *model, + const std::optional ¶ms) + : backend_base_t(model), + mContext_(llama_context_factory(model)), + mWorker_(mModel_, params.value_or(llama_context_default_params())) { + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); + } + + std::expected + single_worker_backend_t::generate( + std::span tokens, + std::span out, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) { + return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback); + } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e4814d45689..e7545a3c9e1 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -8,25 +8,42 @@ #include #include #include +#include #include #include +#include #include +#include #include #include +#include #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llamacpp { - enum BackendError : uint8_t { + + static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; + typedef std::unique_ptr llama_context_smart_ptr; + + typedef std::function llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token token_id, bool is_eos) {}; + + /** + * + */ + enum backend_error_t : uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; - struct SamplingParams { - uint32_t topK = std::numeric_limits::max(); - float_t topP = 1.0f; - float_t frequencyPenalty = 0.0f; - float_t repetitionPenalty = 0.0f; + /** + * + */ + struct sampling_params_t { + uint32_t top_k = std::numeric_limits::max(); + float_t top_p = 1.0f; + float_t frequency_penalty = 0.0f; + float_t repetition_penalty = 0.0f; uint64_t seed = 2014; /** @@ -34,38 +51,72 @@ namespace huggingface::tgi::backends::llamacpp { * @param Pointer to the model data * @return */ - std::unique_ptr IntoLlamaSampler(const llama_model *) const; + std::unique_ptr into_llama_sampler(const llama_model *pModel) const; }; - class Worker { - protected: - constexpr static auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; + /** + * + */ + struct generation_params_t { + uint32_t max_new_tokens = std::numeric_limits::max(); + }; - public: - using model_ptr_type = std::shared_ptr; - using context_params_type = llama_context_params; - using token_id_type = llama_token; + struct generation_context_t { + generation_params_t generation_params; + sampling_params_t sampling_params; + std::span input_tokens; + std::span generated_tokens; + }; + /** + * + */ + class worker_t { private: - const model_ptr_type mModel_; - context_params_type mParams_; + const std::shared_ptr mModel_; + const llama_context_params mParams_; public: - Worker(std::shared_ptr pModel, const llama_context_params ¶ms); + /** + * + * @param model + * @param params + */ + worker_t(std::shared_ptr model, const llama_context_params ¶ms); - void 
Loop(std::atomic_flag &, std::atomic_uint8_t &, std::queue &) const; + /** + * + * @param context + * @param generation_context + * @param callback + */ + size_t + generate(llama_context *, const generation_context_t &, const std::optional &) const; + + /** + * + */ + void loop(std::stop_source &driver, std::queue &backlog) const; }; - class BackendBase { + class backend_base_t { - private: + protected: std::shared_ptr mModel_; public: - explicit BackendBase(llama_model *model); - ~BackendBase(); + /** + * + * @param model + */ + explicit backend_base_t(llama_model *model); + + /** + * Destructor + */ + ~backend_base_t(); /** * @@ -76,12 +127,13 @@ namespace huggingface::tgi::backends::llamacpp { * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, BackendError> Generate( - std::span tokens, - std::span out, - const SamplingParams ¶ms, - uint32_t maxNewTokens = std::numeric_limits::max() - 1 - ); + virtual std::expected generate( + std::span input_tokens, + std::span generated_tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) = 0; /** * @@ -91,12 +143,46 @@ namespace huggingface::tgi::backends::llamacpp { * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, BackendError> Generate( + std::expected, backend_error_t> generate( std::span tokens, - const SamplingParams ¶ms, - uint32_t maxNewTokens = std::numeric_limits::max() - 1 + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback = std::nullopt ); }; + + + class single_worker_backend_t : backend_base_t { + private: + constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_smart_ptr { + auto llParams = llama_context_default_params(); + llParams.flash_attn = true; + llParams.n_batch = 1; + llParams.no_perf = true; + llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL; + + return {llama_new_context_with_model(pModel, llParams), llama_context_deleter}; + }; + + llama_context_smart_ptr mContext_; + worker_t mWorker_; + + public: + explicit single_worker_backend_t(llama_model *pModel, const std::optional &); + + using backend_base_t::generate; + + std::expected + generate( + std::span tokens, + std::span out, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) override; + + + }; } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP From 6a5f6b07551bf59d45b1b800779bda4b98709722 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 30 Oct 2024 22:40:49 +0100 Subject: [PATCH 23/91] misc(offline): update offline tester --- backends/llamacpp/CMakeLists.txt | 2 +- backends/llamacpp/offline/main.cpp | 36 ++++++++++++------------------ 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index adcc6af29dc..e536efc57a2 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -55,7 +55,7 @@ if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llamacpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama 
common spdlog::spdlog fmt::fmt) endif () diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index d8121d3df37..57e55efefe0 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -22,27 +22,19 @@ int main(int argc, char **argv) { const auto prompt = "My name is Morgan"; const auto modelPath = absolute(std::filesystem::path(argv[1])); - if (auto maybeBackend = TgiLlamaCppBackend::FromGGUF(modelPath); maybeBackend.has_value()) { - // Retrieve the backend - auto [model, context] = *maybeBackend; - auto backend = TgiLlamaCppBackend(model, context); - - // Generate - const auto promptTokens = backend.Tokenize(prompt); - const auto out = backend.Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); - - if (out.has_value()) - fmt::print(FMT_STRING("Generated: {}"), *out); - else { - const auto err = out.error(); - fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); - } - - } else { - switch (maybeBackend.error()) { - case TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: - fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesnt exist", modelPath); - return maybeBackend.error(); - } + const auto params = llama_model_default_params(); + auto *model = llama_load_model_from_file(modelPath.c_str(), params); + + auto backend = single_worker_backend_t(model, {}); + + // generate + const auto promptTokens = {128000, 9906, 856, 836, 374, 23809, 128001}; + const auto out = backend.generate(promptTokens, {.max_new_tokens = 32}, {.top_k = 40}); + + if (out.has_value()) + fmt::print(FMT_STRING("Generated: {}"), *out); + else { + const auto err = out.error(); + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); } } From d52b4c497887e097ea189fae8d29431e16e1e905 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 31 Oct 2024 17:51:57 +0100 Subject: [PATCH 24/91] feat(backend): full rework of the backend internal to safer c++ --- backends/llamacpp/csrc/backend.cpp | 16 ++++-- backends/llamacpp/csrc/backend.hpp | 12 +++++ backends/llamacpp/csrc/ffi.hpp | 86 ++++++++++++++++++++++++------ backends/llamacpp/src/backend.rs | 43 ++++++++------- backends/llamacpp/src/lib.rs | 51 +++++++++++++++--- backends/llamacpp/src/main.rs | 2 +- 6 files changed, 166 insertions(+), 44 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index daf8de54484..f2f5d4c6aca 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -21,7 +21,7 @@ namespace huggingface::tgi::backends::llamacpp { batch.token[i] = input_tokens[i]; batch.pos[i] = i; batch.n_seq_id[i] = 1; - batch.seq_id[i] = 0; + batch.seq_id[i] = nullptr; batch.logits[i] = false; ++batch.n_tokens; } @@ -84,13 +84,12 @@ namespace huggingface::tgi::backends::llamacpp { const generation_context_t &generation_context, const std::optional &callback) const { // Store information about context and generation size - auto prompt_length = std::ssize(generation_context.input_tokens); auto max_new_tokens = generation_context.generation_params.max_new_tokens; // Convert sampling params to what llama.cpp is looking for auto sampler = generation_context.sampling_params.into_llama_sampler(mModel_.get()); - // Setup the prompt + // Set up the prompt auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); auto batch = llama_batch_get_one(copy.data(), copy.size()); @@ -168,4 +167,15 @@ 
namespace huggingface::tgi::backends::llamacpp { ) { return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback); } + + std::expected + multi_worker_backend_t::generate( + std::span, + std::span, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback) { + SPDLOG_ERROR("Not implemented yet"); + return 0uz; + } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e7545a3c9e1..871490f255b 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -180,8 +180,20 @@ namespace huggingface::tgi::backends::llamacpp { const sampling_params_t &sampling_params, const std::optional &callback ) override; + }; + class multi_worker_backend_t : backend_base_t { + private: + llama_context_smart_ptr mContext_; + public: + std::expected generate( + std::span, + std::span, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) override; }; } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index d15728b9a02..182541141c6 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -12,36 +12,92 @@ #include #include "backend.hpp" -namespace huggingface::tgi::backends::llamacpp::impl { - class LlamaCppBackendImpl; +namespace huggingface::tgi::backends::llamacpp { + struct generation_params_t; + struct sampling_params_t; + + class llama_cpp_backend_impl_t; } #include "backends/llamacpp/src/lib.rs.h" -namespace huggingface::tgi::backends::llamacpp::impl { +namespace huggingface::tgi::backends::llamacpp { + + // Concept identifying types which have a .generate() -> size_t method to do in-place generation + template + concept has_emplace_generate = requires( + T t, + std::span input_tokens, + std::span generated_tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + llama_decode_callback callback + ) { + { + t.generate(input_tokens, generated_tokens, generation_params, sampling_params, callback) + } -> std::same_as>; + }; + + static_assert(has_emplace_generate, + "single_worker_backend_t doesn't meet concept is_generate_emplace_capable"); + static_assert(has_emplace_generate, + "multi_worker_backend_t doesn't meet concept is_generate_emplace_capable"); - class LlamaCppBackendException : std::exception { + class llama_cpp_backend_exception_t : std::exception { }; - class LlamaCppBackendImpl { + /** + * Llama.cpp backend interfacing with Rust FFI layer + */ + class llama_cpp_backend_impl_t { private: - BackendBase _inner; + std::variant mInner_; public: - LlamaCppBackendImpl(llama_model *model) : _inner(model) {} - }; + explicit llama_cpp_backend_impl_t(single_worker_backend_t &&backend) : mInner_(std::move(backend)) {} + + explicit llama_cpp_backend_impl_t(multi_worker_backend_t &&backend) : mInner_(std::move(backend)) {} - std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) { - const auto cxxPath = std::string_view(modelPath); - if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath), nThreads); maybe.has_value()) { - auto [model, context] = *maybe; - return std::make_unique(model, context); - } else { - throw LlamaCppBackendException(); + size_t generate( + rust::Slice input_tokens, + rust::Slice generated_tokens, + const generation_params_t &generation_params, + const sampling_params_t 
&sampling_params, + rust::Fn callback + ) { + // Define the visitor lambda function which requires the has_emplace_generate constraint on T + static auto inner_fw = [=, &generation_params, &sampling_params](T &&backend) + -> std::expected { + + // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* + auto input_tokens_v = + std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); + auto generated_tokens_v = + std::span(reinterpret_cast(generated_tokens.data()), generated_tokens.size()); + + return backend.generate( + input_tokens_v, generated_tokens_v, generation_params, sampling_params, callback); + }; + + if (const auto result = std::visit(inner_fw, mInner_); result.has_value()) { + return *result; + } else { + throw llama_cpp_backend_exception_t(); + } } + }; + + std::unique_ptr create_single_worker_backend(rust::Str modelPath) { + const auto cxxPath = std::string(modelPath); + auto params = llama_model_default_params(); + params.use_mmap = true; + + auto *model = llama_load_model_from_file(cxxPath.c_str(), params); + auto backend = single_worker_backend_t(model, std::nullopt); + return std::make_unique(std::move(backend)); } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index af50470d2a3..6e9e8d2d8af 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,7 +1,8 @@ -use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; +use crate::ffi::{ + create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, +}; use async_trait::async_trait; use cxx::{Exception, UniquePtr}; -use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::thread::spawn; @@ -25,10 +26,7 @@ pub enum LlamaCppBackendError { pub struct LlamaCppBackend {} impl LlamaCppBackend { - pub fn new + Send>( - model_path: P, - n_threads: u16, - ) -> Result { + pub fn new + Send>(model_path: P) -> Result { let path = Arc::new(model_path.as_ref()); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( @@ -36,13 +34,12 @@ impl LlamaCppBackend { )); } - let mut backend = - create_llamacpp_backend(path.to_str().unwrap(), n_threads).map_err(|err| { - LlamaCppBackendError::ModelInitializationFailed( - path.to_path_buf(), - err.what().to_string(), - ) - })?; + let mut backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { + LlamaCppBackendError::ModelInitializationFailed( + path.to_path_buf(), + err.what().to_string(), + ) + })?; info!( "Successfully initialized llama.cpp backend from {}", @@ -57,12 +54,20 @@ impl LlamaCppBackend { fn scheduler_loop(mut backend: UniquePtr) { println!("Scheduler loop"); - let tokens = [128000i32, 5159, 836, 374, 23809]; - let mut generated = vec![0i32; 128]; - match backend - .pin_mut() - .generate(&tokens, &mut generated, 40, 32, 1.0, 1.0, 1.0, 1.0, 2014) - { + let tokens = [128000u32, 5159, 836, 374, 23809]; + let mut generated = vec![0u32; 16]; + let generation_params = GenerationParams { + max_new_tokens: generated.len() as u32, + }; + let sampling_params = SamplingParams::default(); + + match backend.pin_mut().generate( + &tokens, + &mut generated, + &generation_params, + &sampling_params, + |new_token_id: u32, is_eos: bool| println!("Generated {new_token_id} (is_eos: {is_eos})"), + ) { Ok(n_tokens) => { generated.truncate(n_tokens); println!("Generated {} tokens -> {:?}", n_tokens, generated); diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 
673fe130255..9fb79501ba9 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,17 +1,56 @@ +use crate::ffi::SamplingParams; + pub mod backend; -#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp::impl")] +impl Default for SamplingParams { + fn default() -> Self { + Self { + top_k: u32::MAX, + top_p: 1.0f32, + frequency_penalty: 0.0f32, + repetition_penalty: 0.0f32, + seed: 2014u64, + } + } +} + +#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { + struct GenerationParams { + max_new_tokens: u32, + } + + struct SamplingParams { + top_k: u32, + top_p: f32, + frequency_penalty: f32, + repetition_penalty: f32, + seed: u64, + } + unsafe extern "C++" { include!("backends/llamacpp/csrc/ffi.hpp"); + #[cxx_name = "generation_params_t"] + type GenerationParams; + + #[cxx_name = "sampling_params_t"] + type SamplingParams; + /// Represent an instance of the llama.cpp backend instance on C++ side + #[cxx_name = "llama_cpp_backend_impl_t"] type LlamaCppBackendImpl; - #[rust_name = "create_llamacpp_backend"] - fn CreateLlamaCppBackendImpl( - modelPath: &str, - n_threads: u16, - ) -> Result>; + #[rust_name = "create_single_worker_backend"] + fn create_single_worker_backend(modelPath: &str) -> Result>; + + fn generate( + self: Pin<&mut LlamaCppBackendImpl>, + tokens: &[u32], + generated: &mut [u32], + generation_params: &GenerationParams, + sampling_params: &SamplingParams, + callback: fn(u32, bool), + ) -> Result; } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 3920da21d9d..62f81848744 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -161,7 +161,7 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(gguf_path, cores_per_instance)?; + let backend = LlamaCppBackend::new(gguf_path)?; // Run server server::run( From 3af2c6837c77d0d38a7feaf8819289615ce9821a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 31 Oct 2024 17:52:18 +0100 Subject: [PATCH 25/91] misc(offline): match rework --- backends/llamacpp/offline/main.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 57e55efefe0..7eb7dbde0a9 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -18,9 +18,7 @@ int main(int argc, char **argv) { } spdlog::set_level(spdlog::level::debug); - - const auto prompt = "My name is Morgan"; - + const auto modelPath = absolute(std::filesystem::path(argv[1])); const auto params = llama_model_default_params(); auto *model = llama_load_model_from_file(modelPath.c_str(), params); @@ -28,7 +26,7 @@ int main(int argc, char **argv) { auto backend = single_worker_backend_t(model, {}); // generate - const auto promptTokens = {128000, 9906, 856, 836, 374, 23809, 128001}; + const auto promptTokens = {128000, 5159, 836, 374, 23809, 11}; const auto out = backend.generate(promptTokens, {.max_new_tokens = 32}, {.top_k = 40}); if (out.has_value()) From f39edc72ff4eaa3226d3ea469ebad6c107dfd5cb Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 31 Oct 2024 21:32:29 +0100 Subject: [PATCH 26/91] feat(backend): add mapping for ignore_eos_token stopping criteria --- backends/llamacpp/csrc/backend.cpp | 6 ++++-- backends/llamacpp/csrc/backend.hpp | 3 ++- backends/llamacpp/src/lib.rs | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp 
index f2f5d4c6aca..665f78df789 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -113,8 +113,10 @@ namespace huggingface::tgi::backends::llamacpp { auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); - generation_context.generated_tokens[n_decoded_tokens] = new_token_id; - generating = !is_eos; + if (!generation_context.generation_params.ignore_eos_token) { + generation_context.generated_tokens[n_decoded_tokens] = new_token_id; + generating = !is_eos; + } // Bubble up the generated token if a callback is provided std::invoke(std::forward(callback_), new_token_id, is_eos); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 871490f255b..44952a5ddbf 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -27,7 +27,7 @@ namespace huggingface::tgi::backends::llamacpp { typedef std::unique_ptr llama_context_smart_ptr; typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token token_id, bool is_eos) {}; + static constexpr auto llama_void_callback = [](llama_token, bool) {}; /** * @@ -59,6 +59,7 @@ namespace huggingface::tgi::backends::llamacpp { */ struct generation_params_t { uint32_t max_new_tokens = std::numeric_limits::max(); + bool ignore_eos_token = false; }; struct generation_context_t { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 9fb79501ba9..33088d54c25 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -18,6 +18,7 @@ impl Default for SamplingParams { mod ffi { struct GenerationParams { max_new_tokens: u32, + ignore_eos_token: bool, } struct SamplingParams { From d4aee42fd8dc16113c42c1d6032f405717c5794b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 1 Nov 2024 00:49:50 +0100 Subject: [PATCH 27/91] feat(backend): add logit parameter in the callback fn --- backends/llamacpp/csrc/backend.cpp | 4 +++- backends/llamacpp/csrc/backend.hpp | 4 ++-- backends/llamacpp/csrc/ffi.hpp | 2 +- backends/llamacpp/src/lib.rs | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 665f78df789..50d5897cb25 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -111,6 +111,7 @@ namespace huggingface::tgi::backends::llamacpp { if (LLAMA_SUCCESS(status)) { // Sample the new token auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + auto new_token_logits = 0.0f; // TODO: return logit auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); if (!generation_context.generation_params.ignore_eos_token) { @@ -119,7 +120,8 @@ namespace huggingface::tgi::backends::llamacpp { } // Bubble up the generated token if a callback is provided - std::invoke(std::forward(callback_), new_token_id, is_eos); + std::invoke( + std::forward(callback_), new_token_id, new_token_logits, is_eos); batch = llama_batch_get_one(&new_token_id, 1); } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 44952a5ddbf..288bf36afce 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -26,8 +26,8 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; typedef std::unique_ptr llama_context_smart_ptr; - typedef 
std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token, bool) {}; + typedef std::function llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {}; /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 182541141c6..5c404b01176 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -66,7 +66,7 @@ namespace huggingface::tgi::backends::llamacpp { rust::Slice generated_tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - rust::Fn callback + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T static auto inner_fw = [=, &generation_params, &sampling_params](T &&backend) diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 33088d54c25..8d51a15a1bb 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -51,7 +51,7 @@ mod ffi { generated: &mut [u32], generation_params: &GenerationParams, sampling_params: &SamplingParams, - callback: fn(u32, bool), + callback: fn(u32, f32, bool), ) -> Result; } } From 612f2f939f2b40d76db4a77032695fb90e1fd084 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 1 Nov 2024 00:50:42 +0100 Subject: [PATCH 28/91] feat(backend): bind incoming request to the server --- backends/llamacpp/src/backend.rs | 158 +++++++++++++++++++++++++------ backends/llamacpp/src/lib.rs | 2 + 2 files changed, 129 insertions(+), 31 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 6e9e8d2d8af..670f4397901 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -2,18 +2,54 @@ use crate::ffi::{ create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, }; use async_trait::async_trait; -use cxx::{Exception, UniquePtr}; +use cxx::UniquePtr; use std::path::{Path, PathBuf}; +use std::sync::mpsc::{channel, Receiver, SendError, Sender}; use std::sync::Arc; -use std::thread::spawn; +use std::thread::{spawn, JoinHandle}; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; -use text_generation_router::validation::ValidGenerateRequest; +use text_generation_router::validation::{ + ValidGenerateRequest, ValidParameters, ValidStoppingParameters, +}; +use text_generation_router::Token; use thiserror::Error; +use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; +use tokio::sync::TryAcquireError; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::info; +use tracing::{error, info}; unsafe impl Send for LlamaCppBackendImpl {} +impl From<&ValidParameters> for SamplingParams { + fn from(v: &ValidParameters) -> Self { + Self { + top_k: v.top_k, + top_p: v.top_p, + frequency_penalty: v.frequency_penalty, + repetition_penalty: v.repetition_penalty, + seed: v.seed, + } + } +} + +impl From<&ValidStoppingParameters> for GenerationParams { + fn from(v: &ValidStoppingParameters) -> Self { + Self { + max_new_tokens: v.max_new_tokens, + ignore_eos_token: v.ignore_eos_token, + } + } +} + +#[cfg_attr(debug_assertions, derive(Debug))] +struct InferContext { + pub(crate) stream: UnboundedSender>, + pub(crate) input_tokens: Arc>, + pub(crate) generated_tokens: Vec, + pub(crate) generation_params: GenerationParams, + pub(crate) sampling_params: SamplingParams, +} + #[derive(Debug, Error)] pub enum LlamaCppBackendError { #[error("Provided GGUF model path {0} 
doesn't exist")] @@ -23,7 +59,10 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -pub struct LlamaCppBackend {} +pub struct LlamaCppBackend { + backlog: Sender, + scheduler_handle: JoinHandle<()>, +} impl LlamaCppBackend { pub fn new + Send>(model_path: P) -> Result { @@ -34,7 +73,7 @@ impl LlamaCppBackend { )); } - let mut backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { + let backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { LlamaCppBackendError::ModelInitializationFailed( path.to_path_buf(), err.what().to_string(), @@ -46,33 +85,67 @@ impl LlamaCppBackend { path.display() ); - let j = spawn(|| scheduler_loop(backend)); - j.join().ok(); - Ok(Self {}) + let (submitter, receiver) = channel(); + let handle = spawn(|| scheduler_loop(backend, receiver)); + Ok(Self { + backlog: submitter, + scheduler_handle: handle, + }) } } -fn scheduler_loop(mut backend: UniquePtr) { - println!("Scheduler loop"); - let tokens = [128000u32, 5159, 836, 374, 23809]; - let mut generated = vec![0u32; 16]; - let generation_params = GenerationParams { - max_new_tokens: generated.len() as u32, - }; - let sampling_params = SamplingParams::default(); - - match backend.pin_mut().generate( - &tokens, - &mut generated, - &generation_params, - &sampling_params, - |new_token_id: u32, is_eos: bool| println!("Generated {new_token_id} (is_eos: {is_eos})"), - ) { - Ok(n_tokens) => { - generated.truncate(n_tokens); - println!("Generated {} tokens -> {:?}", n_tokens, generated); +fn scheduler_loop( + mut backend: UniquePtr, + mut backlog: Receiver, +) { + loop { + println!("Looping"); + if let Ok(mut ctx) = backlog.recv() { + println!("{ctx:?}, {}", &ctx.generated_tokens.capacity()); + match backend.pin_mut().generate( + &ctx.input_tokens, + &mut ctx.generated_tokens, + &ctx.generation_params, + &ctx.sampling_params, + |new_token_id: u32, new_token_logit: f32, is_eos: bool| { + let response = InferStreamResponse::Intermediate { + token: Token { + id: new_token_id, + text: "".to_string(), + logprob: new_token_logit, + special: false, + }, + top_tokens: vec![], + }; + println!("Generated token: {response:?}"); + // let _ = tokio::spawn(async { + // match ctx.stream.send(Ok(response)) { + // Ok(_) => {} + // Err(ref err) => { + // error!( + // "Failed to send back token to the client: {}", + // err.to_string() + // ); + // } + // } + // }); + }, + ) { + Ok(n_tokens) => { + unsafe { + ctx.generated_tokens.set_len(n_tokens); + } + println!( + "Generated {} tokens -> {:?}", + n_tokens, &ctx.generated_tokens + ); + } + Err(err) => println!("Error: {}", err), + } + } else { + info!("IPC channel is closed, exiting the scheduler loop"); + break; } - Err(err) => println!("Error: {}", err), } } @@ -80,9 +153,32 @@ fn scheduler_loop(mut backend: UniquePtr) { impl Backend for LlamaCppBackend { fn schedule( &self, - _request: ValidGenerateRequest, + request: ValidGenerateRequest, ) -> Result>, InferError> { - Err(InferError::GenerationError("Not implemented yet".into())) + if let Some(input_ids) = request.input_ids { + let (sx, rx) = unbounded_channel(); + let sampling_params = SamplingParams::from(&request.parameters); + let generation_params = GenerationParams::from(&request.stopping_parameters); + + let ctx = InferContext { + stream: sx, + input_tokens: Arc::clone(&input_ids), + generated_tokens: Vec::with_capacity(generation_params.max_new_tokens as usize), + generation_params, + sampling_params, + }; + + match self.backlog.send(ctx) { + Ok(_) => 
Ok(UnboundedReceiverStream::new(rx)), + Err(_) => Err(InferError::GenerationError( + "Failed to sent the request".to_string(), + )), + } + } else { + Err(InferError::GenerationError( + "Unsupported modalities".to_string(), + )) + } } async fn health(&self, _: bool) -> bool { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 8d51a15a1bb..489188c1a6f 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -16,11 +16,13 @@ impl Default for SamplingParams { #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { + #[derive(Debug, Copy, Clone)] struct GenerationParams { max_new_tokens: u32, ignore_eos_token: bool, } + #[derive(Debug, Copy, Clone)] struct SamplingParams { top_k: u32, top_p: f32, From b50dcddbb8d5b02633083dbcb626d33b531fc9b3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:36:32 +0100 Subject: [PATCH 29/91] feat(backend): avoid dropping the boxed stream at the end of the callback --- backends/llamacpp/csrc/ffi.hpp | 21 +++++++-- backends/llamacpp/src/backend.rs | 80 +++++++++++++++++++------------- backends/llamacpp/src/lib.rs | 23 +++++++-- 3 files changed, 84 insertions(+), 40 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 5c404b01176..c823b72b83f 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -21,6 +21,7 @@ namespace huggingface::tgi::backends::llamacpp { #include "backends/llamacpp/src/lib.rs.h" +#include "rust/cxx.h" namespace huggingface::tgi::backends::llamacpp { @@ -61,17 +62,22 @@ namespace huggingface::tgi::backends::llamacpp { explicit llama_cpp_backend_impl_t(multi_worker_backend_t &&backend) : mInner_(std::move(backend)) {} - size_t generate( + size_t stream( rust::Slice input_tokens, rust::Slice generated_tokens, - const generation_params_t &generation_params, + const generation_params_t generation_params, const sampling_params_t &sampling_params, - rust::Fn callback + OpaqueStream *stream, + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T - static auto inner_fw = [=, &generation_params, &sampling_params](T &&backend) + static auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) -> std::expected { + auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){ + callback(stream, new_token_id, logits, is_eos); + }; + // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* auto input_tokens_v = std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); @@ -79,7 +85,12 @@ namespace huggingface::tgi::backends::llamacpp { std::span(reinterpret_cast(generated_tokens.data()), generated_tokens.size()); return backend.generate( - input_tokens_v, generated_tokens_v, generation_params, sampling_params, callback); + input_tokens_v, + generated_tokens_v, + generation_params, + sampling_params, + context_forwarding_callback + ); }; if (const auto result = std::visit(inner_fw, mInner_); result.has_value()) { diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 670f4397901..09afbc7bec0 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,23 +1,27 @@ use crate::ffi::{ create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, }; +use crate::OpaqueStream; use async_trait::async_trait; use cxx::UniquePtr; use std::path::{Path, PathBuf}; use 
std::sync::mpsc::{channel, Receiver, SendError, Sender}; use std::sync::Arc; use std::thread::{spawn, JoinHandle}; -use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::{ ValidGenerateRequest, ValidParameters, ValidStoppingParameters, }; -use text_generation_router::Token; +use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::sync::TryAcquireError; +use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{error, info}; +type BoxedOpaqueStream = Box; + unsafe impl Send for LlamaCppBackendImpl {} impl From<&ValidParameters> for SamplingParams { @@ -86,7 +90,7 @@ impl LlamaCppBackend { ); let (submitter, receiver) = channel(); - let handle = spawn(|| scheduler_loop(backend, receiver)); + let handle = unsafe { spawn(|| scheduler_loop(backend, receiver)) }; Ok(Self { backlog: submitter, scheduler_handle: handle, @@ -94,47 +98,59 @@ impl LlamaCppBackend { } } -fn scheduler_loop( +fn llama_generate_callback( + channel: *mut OpaqueStream, + new_token_id: u32, + new_token_logit: f32, + is_eos: bool, +) { + let response = InferStreamResponse::Intermediate { + token: Token { + id: new_token_id, + text: "".to_string(), + logprob: new_token_logit, + special: false, + }, + top_tokens: vec![], + }; + println!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); + + unsafe { + if let Err(ref err) = (*channel).0.send(Ok(response)) { + error!( + "Failed to send back token to the client: {}", + err.to_string() + ); + } + } +} + +unsafe fn scheduler_loop( mut backend: UniquePtr, mut backlog: Receiver, ) { loop { - println!("Looping"); if let Ok(mut ctx) = backlog.recv() { - println!("{ctx:?}, {}", &ctx.generated_tokens.capacity()); - match backend.pin_mut().generate( + let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream)); + let stream_ptr = Box::into_raw(stream); + let result = backend.pin_mut().stream( &ctx.input_tokens, &mut ctx.generated_tokens, - &ctx.generation_params, + ctx.generation_params, &ctx.sampling_params, - |new_token_id: u32, new_token_logit: f32, is_eos: bool| { - let response = InferStreamResponse::Intermediate { - token: Token { - id: new_token_id, - text: "".to_string(), - logprob: new_token_logit, - special: false, - }, - top_tokens: vec![], - }; - println!("Generated token: {response:?}"); - // let _ = tokio::spawn(async { - // match ctx.stream.send(Ok(response)) { - // Ok(_) => {} - // Err(ref err) => { - // error!( - // "Failed to send back token to the client: {}", - // err.to_string() - // ); - // } - // } - // }); - }, - ) { + stream_ptr, + llama_generate_callback, + ); + + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(stream_ptr); + + match result { Ok(n_tokens) => { unsafe { ctx.generated_tokens.set_len(n_tokens); } + println!( "Generated {} tokens -> {:?}", n_tokens, &ctx.generated_tokens diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 489188c1a6f..f923526f98b 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,4 +1,6 @@ use crate::ffi::SamplingParams; +use text_generation_router::infer::{InferError, InferStreamResponse}; +use tokio::sync::mpsc::UnboundedSender; pub mod backend; @@ -14,6 +16,8 @@ impl Default for SamplingParams { } } +struct 
OpaqueStream(UnboundedSender>); + #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { #[derive(Debug, Copy, Clone)] @@ -31,6 +35,10 @@ mod ffi { seed: u64, } + extern "Rust" { + type OpaqueStream; + } + unsafe extern "C++" { include!("backends/llamacpp/csrc/ffi.hpp"); @@ -47,13 +55,22 @@ mod ffi { #[rust_name = "create_single_worker_backend"] fn create_single_worker_backend(modelPath: &str) -> Result>; - fn generate( + // fn generate( + // self: Pin<&mut LlamaCppBackendImpl>, + // tokens: &[u32], + // generated: &mut [u32], + // generation_params: GenerationParams, + // sampling_params: &SamplingParams, + // ) -> Result; + + unsafe fn stream( self: Pin<&mut LlamaCppBackendImpl>, tokens: &[u32], generated: &mut [u32], - generation_params: &GenerationParams, + generation_params: GenerationParams, sampling_params: &SamplingParams, - callback: fn(u32, f32, bool), + stream: *mut OpaqueStream, + callback: unsafe fn(*mut OpaqueStream, u32, f32, bool), ) -> Result; } } From 3e82f14f577fd2ac3c8b2b4352e0c0bfbca8373d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:46:04 +0100 Subject: [PATCH 30/91] feat(backend): somewhat generates the final infer response --- backends/llamacpp/src/backend.rs | 34 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 09afbc7bec0..5262bd8a919 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -18,7 +18,7 @@ use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::sync::TryAcquireError; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{error, info}; +use tracing::{debug, error, info}; type BoxedOpaqueStream = Box; @@ -113,7 +113,7 @@ fn llama_generate_callback( }, top_tokens: vec![], }; - println!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); + debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); unsafe { if let Err(ref err) = (*channel).0.send(Ok(response)) { @@ -121,7 +121,7 @@ fn llama_generate_callback( "Failed to send back token to the client: {}", err.to_string() ); - } + }; } } @@ -131,6 +131,7 @@ unsafe fn scheduler_loop( ) { loop { if let Ok(mut ctx) = backlog.recv() { + let start = Instant::now(); let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream)); let stream_ptr = Box::into_raw(stream); let result = backend.pin_mut().stream( @@ -143,7 +144,7 @@ unsafe fn scheduler_loop( ); // Make sure we re-keep track of the OpaqueStream box - let _ = Box::from_raw(stream_ptr); + let stream = Box::from_raw(stream_ptr); match result { Ok(n_tokens) => { @@ -151,12 +152,27 @@ unsafe fn scheduler_loop( ctx.generated_tokens.set_len(n_tokens); } - println!( - "Generated {} tokens -> {:?}", - n_tokens, &ctx.generated_tokens - ); + let _ = stream.0.send(Ok(InferStreamResponse::End { + token: Token { + id: ctx.generated_tokens[n_tokens - 1], + text: "".to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + generated_text: GeneratedText { + text: "".to_string(), + generated_tokens: n_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.sampling_params.seed), + }, + start, + queued: start, + })); + + debug!("Generated {n_tokens} tokens -> {:?}", ctx.generated_tokens); } - Err(err) => println!("Error: {}", err), + Err(err) => println!("Error: {err}"), } } else { info!("IPC channel is closed, exiting the 
scheduler loop"); From bd8f0f15e11d433e31d64ae101d6e5c62b1765d6 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:52:58 +0100 Subject: [PATCH 31/91] feat(backend): fix invalid reference to ctx instead of context in release build --- backends/llamacpp/csrc/backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 50d5897cb25..b88067f8b1a 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -105,7 +105,7 @@ namespace huggingface::tgi::backends::llamacpp { const auto latency = std::chrono::duration_cast(end - start); SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(ctx, batch); + const auto status = llama_decode(context, batch); #endif batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) { From 2cdfed94d92299479d9d022f9d88dd11cf785a29 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:53:17 +0100 Subject: [PATCH 32/91] feat(backend): correctly link to shared fmt and spdlog instead of static --- backends/llamacpp/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 6d6bd514957..eefc6403278 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -102,8 +102,8 @@ fn main() { println!("cargo:rustc-link-lib=static=fmtd"); println!("cargo:rustc-link-lib=static=spdlogd"); } else { - println!("cargo:rustc-link-lib=static=fmt"); - println!("cargo:rustc-link-lib=static=spdlog"); + println!("cargo:rustc-link-lib=fmt"); + println!("cargo:rustc-link-lib=spdlog"); } println!("cargo:rustc-link-lib=static=common"); From 86a2ae6ba2ad74b28521cb9f1732b4af96811709 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:53:34 +0100 Subject: [PATCH 33/91] chore: unsued variables --- backends/llamacpp/src/backend.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 5262bd8a919..bfdac34b520 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -5,7 +5,7 @@ use crate::OpaqueStream; use async_trait::async_trait; use cxx::UniquePtr; use std::path::{Path, PathBuf}; -use std::sync::mpsc::{channel, Receiver, SendError, Sender}; +use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; use std::thread::{spawn, JoinHandle}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; @@ -15,7 +15,6 @@ use text_generation_router::validation::{ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::sync::TryAcquireError; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; @@ -127,7 +126,7 @@ fn llama_generate_callback( unsafe fn scheduler_loop( mut backend: UniquePtr, - mut backlog: Receiver, + backlog: Receiver, ) { loop { if let Ok(mut ctx) = backlog.recv() { From 7b0a56f40fc5766bef8c707a800b53462399f31c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 11:17:02 +0100 Subject: [PATCH 34/91] feat(backend): fix memory leaking on llama_sampler when the decode ends --- backends/llamacpp/csrc/backend.cpp | 4 ++-- backends/llamacpp/csrc/backend.hpp | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 
deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index b88067f8b1a..4b6086200aa 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -29,7 +29,7 @@ namespace huggingface::tgi::backends::llamacpp { batch.logits[batch.n_tokens] = true; } - std::unique_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { + llama_sampler_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { auto *pSampler = llama_sampler_chain_init({.no_perf = false}); // Penalties @@ -51,7 +51,7 @@ namespace huggingface::tgi::backends::llamacpp { } llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); - return std::unique_ptr(pSampler); + return llama_sampler_ptr(pSampler, llama_sampler_deleter); } worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 288bf36afce..70f992687f7 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -24,7 +24,10 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; - typedef std::unique_ptr llama_context_smart_ptr; + typedef std::unique_ptr llama_context_ptr; + + static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; + typedef std::unique_ptr llama_sampler_ptr; typedef std::function llama_decode_callback; static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {}; @@ -51,7 +54,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param Pointer to the model data * @return */ - std::unique_ptr into_llama_sampler(const llama_model *pModel) const; + llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const; }; /** @@ -155,7 +158,7 @@ namespace huggingface::tgi::backends::llamacpp { class single_worker_backend_t : backend_base_t { private: - constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_smart_ptr { + constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { auto llParams = llama_context_default_params(); llParams.flash_attn = true; llParams.n_batch = 1; @@ -165,7 +168,7 @@ namespace huggingface::tgi::backends::llamacpp { return {llama_new_context_with_model(pModel, llParams), llama_context_deleter}; }; - llama_context_smart_ptr mContext_; + llama_context_ptr mContext_; worker_t mWorker_; public: @@ -185,7 +188,7 @@ namespace huggingface::tgi::backends::llamacpp { class multi_worker_backend_t : backend_base_t { private: - llama_context_smart_ptr mContext_; + llama_context_ptr mContext_; public: std::expected generate( From 31d925477600564da81668727f86b954b1a13e26 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 11:25:12 +0100 Subject: [PATCH 35/91] feat(backend): remove static from inner_fw visitor as it leads to invalid memory locations --- backends/llamacpp/csrc/ffi.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index c823b72b83f..63f8d3b6c7f 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -71,7 +71,7 @@ namespace huggingface::tgi::backends::llamacpp { rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T - static auto inner_fw = [=, 
&sampling_params, &stream, &callback](T &&backend) + auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) -> std::expected { auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){ From 188442f67dd68520896b81fe56abf49f55c7082d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 14:26:57 +0100 Subject: [PATCH 36/91] misc(lint): make clippy happier --- Cargo.lock | 36 ++++++----------------------------- backends/llamacpp/Cargo.toml | 2 +- backends/llamacpp/src/main.rs | 36 ++++++++++++++--------------------- 3 files changed, 21 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 479e94d7fca..6b6cb7a7e18 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4239,7 +4239,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.27.0", "tracing-subscriber", - "utoipa 5.1.2", + "utoipa", ] [[package]] @@ -4368,7 +4368,7 @@ dependencies = [ "tracing-opentelemetry 0.21.0", "tracing-subscriber", "ureq", - "utoipa 4.2.3", + "utoipa", "utoipa-swagger-ui", "uuid", "vergen", @@ -4419,7 +4419,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa 4.2.3", + "utoipa", "utoipa-swagger-ui", ] @@ -4470,7 +4470,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa 4.2.3", + "utoipa", "utoipa-swagger-ui", ] @@ -5192,19 +5192,7 @@ dependencies = [ "indexmap 2.6.0", "serde", "serde_json", - "utoipa-gen 4.3.0", -] - -[[package]] -name = "utoipa" -version = "5.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e12e84f0ff45b6818029cd0f67280e453c80132c1b9897df407ecc20b9f7cfd" -dependencies = [ - "indexmap 2.5.0", - "serde", - "serde_json", - "utoipa-gen 5.1.2", + "utoipa-gen", ] [[package]] @@ -5220,18 +5208,6 @@ dependencies = [ "syn 2.0.85", ] -[[package]] -name = "utoipa-gen" -version = "5.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dfc694d3a3118d2b9e80d68be83bf1aab7988510916934db83da61c14e7e6b2" -dependencies = [ - "proc-macro2", - "quote", - "regex", - "syn 2.0.79", -] - [[package]] name = "utoipa-swagger-ui" version = "6.0.0" @@ -5244,7 +5220,7 @@ dependencies = [ "rust-embed", "serde", "serde_json", - "utoipa 4.2.3", + "utoipa", "zip", ] diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 4a14dcdfd05..48a0bb84362 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -22,7 +22,7 @@ tokenizers = { workspace = true } tracing = "0.1" tracing-opentelemetry = "0.27.0" tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } -utoipa = { version = "5.1.2", features = ["axum_extras"] } +utoipa = { version = "4.2.3", features = ["axum_extras"] } log = "0.4.22" [build-dependencies] diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 62f81848744..f128a6a3fc6 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,6 +1,7 @@ use clap::{Parser, Subcommand}; use std::path::PathBuf; use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackendError}; +use text_generation_router::server::ApiDoc; use text_generation_router::{server, usage_stats}; use thiserror::Error; @@ -35,13 +36,8 @@ struct Args { port: u16, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, - #[clap( - long, - env, - default_value = "1", - help = "Number of CPU threads allocated to one llama.cpp model" - )] - cores_per_instance: 
u16, + #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] + num_model_instance: u16, #[clap(default_value = "bigscience/bloom", long, env)] tokenizer_name: String, #[clap(long, env)] @@ -67,8 +63,6 @@ struct Args { #[clap(long, env)] ngrok_edge: Option, #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, @@ -100,7 +94,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, gguf_path, - cores_per_instance, + num_model_instance, tokenizer_name, tokenizer_config_path, revision, @@ -113,19 +107,17 @@ async fn main() -> Result<(), RouterError> { ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, } = args; - // if let Some(Commands::PrintSchema) = command { - // use utoipa::OpenApi; - // let api_doc = ApiDoc::openapi(); - // let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); - // println!("{}", api_doc); - // std::process::exit(0); - // }; + if let Some(Commands::PrintSchema) = command { + use utoipa::OpenApi; + let api_doc = ApiDoc::openapi().to_pretty_json().unwrap(); + println!("{}", api_doc); + std::process::exit(0); + }; text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); // Validate args @@ -144,11 +136,11 @@ async fn main() -> Result<(), RouterError> { )); } - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { + if let Some(max_batch_total_tokens) = max_batch_total_tokens { + if max_batch_prefill_tokens > max_batch_total_tokens { return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); } - if max_total_tokens as u32 > *max_batch_total_tokens { + if max_total_tokens as u32 > max_batch_total_tokens { return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. 
Given: {max_total_tokens} and {max_batch_total_tokens}"))); } } @@ -177,13 +169,13 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + false, hostname, port, cors_allow_origin, ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, From 05ff551950dad2948f5f8fa10234496179dffd42 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 23:07:22 +0100 Subject: [PATCH 37/91] feat(backend): add number of generated tokens in the callback --- backends/llamacpp/csrc/backend.cpp | 4 ++-- backends/llamacpp/csrc/backend.hpp | 4 ++-- backends/llamacpp/csrc/ffi.hpp | 6 +++--- backends/llamacpp/src/backend.rs | 3 ++- backends/llamacpp/src/lib.rs | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 4b6086200aa..54e41a14312 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -120,8 +120,8 @@ namespace huggingface::tgi::backends::llamacpp { } // Bubble up the generated token if a callback is provided - std::invoke( - std::forward(callback_), new_token_id, new_token_logits, is_eos); + std::invoke(std::forward(callback_), + new_token_id, new_token_logits, is_eos, n_decoded_tokens); batch = llama_batch_get_one(&new_token_id, 1); } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 70f992687f7..ebae7fb0db5 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -29,8 +29,8 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; typedef std::unique_ptr llama_sampler_ptr; - typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {}; + typedef std::function llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) {}; /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 63f8d3b6c7f..df924cb7fd1 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -68,14 +68,14 @@ namespace huggingface::tgi::backends::llamacpp { const generation_params_t generation_params, const sampling_params_t &sampling_params, OpaqueStream *stream, - rust::Fn callback + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) -> std::expected { - auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){ - callback(stream, new_token_id, logits, is_eos); + auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ + callback(stream, new_token_id, logits, is_eos, n_generated_tokens); }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index bfdac34b520..c3fff6979b6 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -102,6 +102,7 @@ fn llama_generate_callback( new_token_id: u32, new_token_logit: f32, is_eos: bool, + n_generated_tokens: usize, ) { let response = InferStreamResponse::Intermediate { token: Token { @@ -112,7 +113,7 @@ fn 
llama_generate_callback( }, top_tokens: vec![], }; - debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); + info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos} ({n_generated_tokens})"); unsafe { if let Err(ref err) = (*channel).0.send(Ok(response)) { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index f923526f98b..277f77cbf04 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -70,7 +70,7 @@ mod ffi { generation_params: GenerationParams, sampling_params: &SamplingParams, stream: *mut OpaqueStream, - callback: unsafe fn(*mut OpaqueStream, u32, f32, bool), + callback: unsafe fn(*mut OpaqueStream, u32, f32, bool, usize), ) -> Result; } } From 06424aa9ff44a7d3edee24cb8ce7de5681222184 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 23:50:46 +0100 Subject: [PATCH 38/91] feat(backend): correctly handle the max_new_tokens case for is_eos --- backends/llamacpp/csrc/backend.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 54e41a14312..733a826a70b 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -113,6 +113,7 @@ namespace huggingface::tgi::backends::llamacpp { auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); auto new_token_logits = 0.0f; // TODO: return logit auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); + auto effective_n_decoded_tokens = n_decoded_tokens + 1; if (!generation_context.generation_params.ignore_eos_token) { generation_context.generated_tokens[n_decoded_tokens] = new_token_id; @@ -121,7 +122,10 @@ namespace huggingface::tgi::backends::llamacpp { // Bubble up the generated token if a callback is provided std::invoke(std::forward(callback_), - new_token_id, new_token_logits, is_eos, n_decoded_tokens); + new_token_id, + new_token_logits, + is_eos || effective_n_decoded_tokens == max_new_tokens, + effective_n_decoded_tokens); batch = llama_batch_get_one(&new_token_id, 1); } From 11c593dc69f9c7b800cd0dbac73e1e00d696867a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 00:11:55 +0100 Subject: [PATCH 39/91] feat(backend): make eog clearer on c++ side --- backends/llamacpp/csrc/backend.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 733a826a70b..79c09a26c6a 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -95,7 +95,7 @@ namespace huggingface::tgi::backends::llamacpp { // Decode auto n_decoded_tokens = 0; - for (bool generating = true; generating && n_decoded_tokens < max_new_tokens; ++n_decoded_tokens) { + for (bool generating = true; generating; ++n_decoded_tokens) { const auto callback_ = callback.value_or(llama_void_callback); #ifdef TGI_LLAMACPP_BACKEND_DEBUG @@ -108,24 +108,27 @@ namespace huggingface::tgi::backends::llamacpp { const auto status = llama_decode(context, batch); #endif batch.n_tokens = 0; - if (LLAMA_SUCCESS(status)) { + if (LLAMA_SUCCESS(status)) [[likely]] { // Sample the new token auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id); auto new_token_logits = 0.0f; // TODO: return logit - auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); - auto effective_n_decoded_tokens = 
n_decoded_tokens + 1; - if (!generation_context.generation_params.ignore_eos_token) { - generation_context.generated_tokens[n_decoded_tokens] = new_token_id; - generating = !is_eos; - } + // Push the token to the generated vector on Rust side + generation_context.generated_tokens[n_decoded_tokens] = new_token_id; + + // Handle termination cases + const auto has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; + const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; + + generating = !(has_reach_max_tokens | has_reach_eog); // Bubble up the generated token if a callback is provided std::invoke(std::forward(callback_), new_token_id, new_token_logits, - is_eos || effective_n_decoded_tokens == max_new_tokens, - effective_n_decoded_tokens); + !generating, + n_decoded_tokens + 1); batch = llama_batch_get_one(&new_token_id, 1); } From 5b7a951389216a58cc603c28b1c3ea8e87930bca Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 16:17:43 +0100 Subject: [PATCH 40/91] feat(backend): refactor the callback to handle intermediate and end inference message --- backends/llamacpp/csrc/backend.cpp | 35 +++---- backends/llamacpp/csrc/backend.hpp | 44 ++++---- backends/llamacpp/csrc/ffi.hpp | 27 ++--- backends/llamacpp/src/backend.rs | 157 ++++++++++++++++------------- backends/llamacpp/src/lib.rs | 12 +-- 5 files changed, 138 insertions(+), 137 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 79c09a26c6a..65898dfe772 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -114,9 +114,6 @@ namespace huggingface::tgi::backends::llamacpp { auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id); auto new_token_logits = 0.0f; // TODO: return logit - // Push the token to the generated vector on Rust side - generation_context.generated_tokens[n_decoded_tokens] = new_token_id; - // Handle termination cases const auto has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; @@ -150,10 +147,15 @@ namespace huggingface::tgi::backends::llamacpp { ) { // TODO: Should we provide a way to change this value? 
auto generated = std::vector(2 << 8); + auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos, + size_t num_generated_tokens) { + generated.emplace_back(new_token_id); + + if (callback.has_value()) + (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); + }; - auto nTokensGenerated = generate(tokens, generated, generation_params, sampling_params, callback); - if (nTokensGenerated.has_value()) - generated.resize(*nTokensGenerated); + auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback); return generated; } @@ -168,25 +170,24 @@ namespace huggingface::tgi::backends::llamacpp { llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); } - std::expected - single_worker_backend_t::generate( + std::expected + single_worker_backend_t::stream( std::span tokens, - std::span out, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback + const llama_decode_callback &callback ) { - return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback); + return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens}, callback); } std::expected - multi_worker_backend_t::generate( - std::span, - std::span, + multi_worker_backend_t::stream( + std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback) { - SPDLOG_ERROR("Not implemented yet"); - return 0uz; + const llama_decode_callback &callback + ) { + SPDLOG_WARN("Not implemented for multi_worker_t"); + return 0; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index ebae7fb0db5..1fef7fb8931 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -69,7 +69,6 @@ namespace huggingface::tgi::backends::llamacpp { generation_params_t generation_params; sampling_params_t sampling_params; std::span input_tokens; - std::span generated_tokens; }; /** @@ -125,34 +124,34 @@ namespace huggingface::tgi::backends::llamacpp { /** * * @param tokens - * @params out - * @param params - * @param maxNewTokens + * @param generation_params + * @param sampling_params + * @param callback * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - virtual std::expected generate( - std::span input_tokens, - std::span generated_tokens, + std::expected, backend_error_t> generate( + std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback - ) = 0; + const std::optional &callback = std::nullopt + ); /** * * @param tokens - * @param params - * @param maxNewTokens + * @param generation_params + * @param sampling_params + * @params callback * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, backend_error_t> generate( + virtual std::expected stream( std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback = std::nullopt - ); + const llama_decode_callback &callback + ) = 0; }; @@ -174,16 +173,11 @@ namespace huggingface::tgi::backends::llamacpp { public: explicit single_worker_backend_t(llama_model *pModel, const std::optional &); - using backend_base_t::generate; - - std::expected - generate( + std::expected stream( 
std::span tokens, - std::span out, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback - ) override; + const llama_decode_callback &callback) override; }; class multi_worker_backend_t : backend_base_t { @@ -191,13 +185,11 @@ namespace huggingface::tgi::backends::llamacpp { llama_context_ptr mContext_; public: - std::expected generate( - std::span, - std::span, + std::expected stream( + std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback - ) override; + const llama_decode_callback &callback) override; }; } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index df924cb7fd1..3ae392f624c 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -28,23 +28,20 @@ namespace huggingface::tgi::backends::llamacpp { // Concept identifying types which have a .generate() -> size_t method to do in-place generation template - concept has_emplace_generate = requires( + concept has_stream_method = requires( T t, std::span input_tokens, - std::span generated_tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, llama_decode_callback callback ) { { - t.generate(input_tokens, generated_tokens, generation_params, sampling_params, callback) + t.stream(input_tokens, generation_params, sampling_params, callback) } -> std::same_as>; }; - static_assert(has_emplace_generate, - "single_worker_backend_t doesn't meet concept is_generate_emplace_capable"); - static_assert(has_emplace_generate, - "multi_worker_backend_t doesn't meet concept is_generate_emplace_capable"); + static_assert(has_stream_method, "single_worker_backend_t doesn't meet concept has_stream_method"); + static_assert(has_stream_method, "multi_worker_backend_t doesn't meet concept has_stream_method"); class llama_cpp_backend_exception_t : std::exception { @@ -64,29 +61,25 @@ namespace huggingface::tgi::backends::llamacpp { size_t stream( rust::Slice input_tokens, - rust::Slice generated_tokens, const generation_params_t generation_params, const sampling_params_t &sampling_params, - OpaqueStream *stream, - rust::Fn callback + InferContext *ctx, + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T - auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) + auto inner_fw = [=, &sampling_params, &ctx, &callback](T &&backend) -> std::expected { - auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ - callback(stream, new_token_id, logits, is_eos, n_generated_tokens); + auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ + callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* auto input_tokens_v = std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); - auto generated_tokens_v = - std::span(reinterpret_cast(generated_tokens.data()), generated_tokens.size()); - return backend.generate( + return backend.stream( input_tokens_v, - generated_tokens_v, generation_params, sampling_params, context_forwarding_callback diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index c3fff6979b6..06e8d43ed9b 100644 --- a/backends/llamacpp/src/backend.rs +++ 
b/backends/llamacpp/src/backend.rs @@ -1,7 +1,6 @@ use crate::ffi::{ create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, }; -use crate::OpaqueStream; use async_trait::async_trait; use cxx::UniquePtr; use std::path::{Path, PathBuf}; @@ -14,12 +13,13 @@ use text_generation_router::validation::{ }; use text_generation_router::{FinishReason, Token}; use thiserror::Error; +use tokio::sync::mpsc::error::SendError; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; -type BoxedOpaqueStream = Box; +type InferResult = Result; unsafe impl Send for LlamaCppBackendImpl {} @@ -45,14 +45,19 @@ impl From<&ValidStoppingParameters> for GenerationParams { } #[cfg_attr(debug_assertions, derive(Debug))] -struct InferContext { - pub(crate) stream: UnboundedSender>, +struct GenerationContext { pub(crate) input_tokens: Arc>, pub(crate) generated_tokens: Vec, pub(crate) generation_params: GenerationParams, pub(crate) sampling_params: SamplingParams, } +pub(crate) struct InferContext { + pub(crate) start: Instant, + pub(crate) stream: UnboundedSender, + pub(crate) generation: GenerationContext, +} + #[derive(Debug, Error)] pub enum LlamaCppBackendError { #[error("Provided GGUF model path {0} doesn't exist")] @@ -63,7 +68,7 @@ pub enum LlamaCppBackendError { } pub struct LlamaCppBackend { - backlog: Sender, + backlog: Sender<(GenerationContext, UnboundedSender)>, scheduler_handle: JoinHandle<()>, } @@ -98,81 +103,96 @@ impl LlamaCppBackend { } fn llama_generate_callback( - channel: *mut OpaqueStream, + ctx: *mut InferContext, new_token_id: u32, new_token_logit: f32, - is_eos: bool, + is_final: bool, n_generated_tokens: usize, ) { - let response = InferStreamResponse::Intermediate { - token: Token { - id: new_token_id, - text: "".to_string(), - logprob: new_token_logit, - special: false, + info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); + + // Decode token + let token = Token { + id: new_token_id, + text: "".to_string(), + logprob: new_token_logit, + special: false, + }; + + let ctx = unsafe { &mut *ctx }; + + // Append the new token to the generated ones + ctx.generation.generated_tokens.push(new_token_id); + + // Create the streamed response + let response = match is_final { + false => InferStreamResponse::Intermediate { + token, + top_tokens: vec![], }, - top_tokens: vec![], + true => { + // Decode the whole text + let text = String::new(); + + // Stream end response + InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text, + generated_tokens: n_generated_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.generation.sampling_params.seed), + }, + start: ctx.start, + queued: ctx.start, + } + } }; - info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos} ({n_generated_tokens})"); - - unsafe { - if let Err(ref err) = (*channel).0.send(Ok(response)) { - error!( - "Failed to send back token to the client: {}", - err.to_string() - ); - }; + + // Send back to the client + if let Err(ref err) = ctx.stream.send(Ok(response)) { + error!("Failed to send back the response to the client, cancelling request"); + // TODO: cancel the request } } unsafe fn scheduler_loop( mut backend: UniquePtr, - backlog: Receiver, + backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { + // This loop will mostly decode single 
token at every step, so no need to rely on parallelism + tokenizers::utils::parallelism::set_parallelism(false); + loop { - if let Ok(mut ctx) = backlog.recv() { + if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); - let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream)); - let stream_ptr = Box::into_raw(stream); - let result = backend.pin_mut().stream( - &ctx.input_tokens, - &mut ctx.generated_tokens, - ctx.generation_params, - &ctx.sampling_params, - stream_ptr, - llama_generate_callback, - ); - - // Make sure we re-keep track of the OpaqueStream box - let stream = Box::from_raw(stream_ptr); - - match result { - Ok(n_tokens) => { - unsafe { - ctx.generated_tokens.set_len(n_tokens); - } - - let _ = stream.0.send(Ok(InferStreamResponse::End { - token: Token { - id: ctx.generated_tokens[n_tokens - 1], - text: "".to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - generated_text: GeneratedText { - text: "".to_string(), - generated_tokens: n_tokens as u32, - finish_reason: FinishReason::Length, - seed: Some(ctx.sampling_params.seed), - }, - start, - queued: start, - })); - - debug!("Generated {n_tokens} tokens -> {:?}", ctx.generated_tokens); + let generation_params = generation.generation_params; // copy + let sampling_params = generation.sampling_params; // copy + let input_tokens = Arc::clone(&generation.input_tokens); + + // Creating the whole InferContext and pushing it to the heap + { + let ctx = Box::new(InferContext { + start, + stream, + generation, + }); + + let boxed_ctx = Box::into_raw(ctx); + + if let Err(e) = backend.pin_mut().stream( + &input_tokens, + generation_params, + &sampling_params, + boxed_ctx, + llama_generate_callback, + ) { + error!("Error while decoding tokens... {}", e.what()); } - Err(err) => println!("Error: {err}"), + + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(boxed_ctx); } } else { info!("IPC channel is closed, exiting the scheduler loop"); @@ -186,21 +206,20 @@ impl Backend for LlamaCppBackend { fn schedule( &self, request: ValidGenerateRequest, - ) -> Result>, InferError> { + ) -> Result, InferError> { if let Some(input_ids) = request.input_ids { let (sx, rx) = unbounded_channel(); let sampling_params = SamplingParams::from(&request.parameters); let generation_params = GenerationParams::from(&request.stopping_parameters); - let ctx = InferContext { - stream: sx, + let ctx = GenerationContext { input_tokens: Arc::clone(&input_ids), generated_tokens: Vec::with_capacity(generation_params.max_new_tokens as usize), generation_params, sampling_params, }; - match self.backlog.send(ctx) { + match self.backlog.send((ctx, sx)) { Ok(_) => Ok(UnboundedReceiverStream::new(rx)), Err(_) => Err(InferError::GenerationError( "Failed to sent the request".to_string(), diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 277f77cbf04..01f2054db89 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,6 +1,5 @@ +use crate::backend::InferContext; use crate::ffi::SamplingParams; -use text_generation_router::infer::{InferError, InferStreamResponse}; -use tokio::sync::mpsc::UnboundedSender; pub mod backend; @@ -16,8 +15,6 @@ impl Default for SamplingParams { } } -struct OpaqueStream(UnboundedSender>); - #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { #[derive(Debug, Copy, Clone)] @@ -36,7 +33,7 @@ mod ffi { } extern "Rust" { - type OpaqueStream; + type InferContext; } unsafe extern "C++" { @@ -66,11 +63,10 @@ 
mod ffi { unsafe fn stream( self: Pin<&mut LlamaCppBackendImpl>, tokens: &[u32], - generated: &mut [u32], generation_params: GenerationParams, sampling_params: &SamplingParams, - stream: *mut OpaqueStream, - callback: unsafe fn(*mut OpaqueStream, u32, f32, bool, usize), + stream: *mut InferContext, + callback: unsafe fn(*mut InferContext, u32, f32, bool, usize), ) -> Result; } } From 958c72a44a4bba4f8cdcb12d09d4038de7dc95bf Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 16:26:05 +0100 Subject: [PATCH 41/91] misc(ffi): remove unused ffi mapping --- backends/llamacpp/src/lib.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 01f2054db89..006c7387ae3 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -52,14 +52,6 @@ mod ffi { #[rust_name = "create_single_worker_backend"] fn create_single_worker_backend(modelPath: &str) -> Result>; - // fn generate( - // self: Pin<&mut LlamaCppBackendImpl>, - // tokens: &[u32], - // generated: &mut [u32], - // generation_params: GenerationParams, - // sampling_params: &SamplingParams, - // ) -> Result; - unsafe fn stream( self: Pin<&mut LlamaCppBackendImpl>, tokens: &[u32], From 1473259f84fb0272b357392a13eaa168d39bc1c4 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 17:01:22 +0100 Subject: [PATCH 42/91] feat(backend): add early stopping criteria from TGI stream callback --- backends/llamacpp/csrc/backend.cpp | 16 +++++++++------- backends/llamacpp/csrc/backend.hpp | 4 ++-- backends/llamacpp/csrc/ffi.hpp | 6 +++--- backends/llamacpp/src/backend.rs | 13 ++++++++----- backends/llamacpp/src/lib.rs | 2 +- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 65898dfe772..f69563811da 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -121,11 +121,12 @@ namespace huggingface::tgi::backends::llamacpp { generating = !(has_reach_max_tokens | has_reach_eog); // Bubble up the generated token if a callback is provided - std::invoke(std::forward(callback_), - new_token_id, - new_token_logits, - !generating, - n_decoded_tokens + 1); + const auto should_stop = std::invoke(std::forward(callback_), + new_token_id, + new_token_logits, + !generating, + n_decoded_tokens + 1); + generating ^= should_stop; batch = llama_batch_get_one(&new_token_id, 1); } @@ -148,11 +149,12 @@ namespace huggingface::tgi::backends::llamacpp { // TODO: Should we provide a way to change this value? 
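        // NOTE: 2 << 8 == 512 entries are pre-allocated for the generated-token buffer;
        // the TODO above refers to making this fixed capacity configurable.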
auto generated = std::vector(2 << 8); auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos, - size_t num_generated_tokens) { + size_t num_generated_tokens) -> bool { generated.emplace_back(new_token_id); if (callback.has_value()) - (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); + return (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); + return true; }; auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 1fef7fb8931..bf9df5cca0e 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -29,8 +29,8 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; typedef std::unique_ptr llama_sampler_ptr; - typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) {}; + typedef std::function llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; }; /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 3ae392f624c..f33a2f1ad57 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -64,14 +64,14 @@ namespace huggingface::tgi::backends::llamacpp { const generation_params_t generation_params, const sampling_params_t &sampling_params, InferContext *ctx, - rust::Fn callback + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T auto inner_fw = [=, &sampling_params, &ctx, &callback](T &&backend) -> std::expected { - auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ - callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); + auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { + return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 06e8d43ed9b..531a07dc6fb 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -13,11 +13,10 @@ use text_generation_router::validation::{ }; use text_generation_router::{FinishReason, Token}; use thiserror::Error; -use tokio::sync::mpsc::error::SendError; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, error, info}; +use tracing::{error, info}; type InferResult = Result; @@ -45,7 +44,7 @@ impl From<&ValidStoppingParameters> for GenerationParams { } #[cfg_attr(debug_assertions, derive(Debug))] -struct GenerationContext { +pub(crate) struct GenerationContext { pub(crate) input_tokens: Arc>, pub(crate) generated_tokens: Vec, pub(crate) generation_params: GenerationParams, @@ -108,7 +107,7 @@ fn llama_generate_callback( new_token_logit: f32, is_final: bool, n_generated_tokens: usize, -) { +) -> bool { info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); // Decode token @@ -151,10 +150,14 @@ fn 
llama_generate_callback( }; // Send back to the client - if let Err(ref err) = ctx.stream.send(Ok(response)) { + if let Err(ref _err) = ctx.stream.send(Ok(response)) { error!("Failed to send back the response to the client, cancelling request"); // TODO: cancel the request + return true; // should_stop } + + // should_stop + false } unsafe fn scheduler_loop( diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 006c7387ae3..abcdd1fad06 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -58,7 +58,7 @@ mod ffi { generation_params: GenerationParams, sampling_params: &SamplingParams, stream: *mut InferContext, - callback: unsafe fn(*mut InferContext, u32, f32, bool, usize), + callback: unsafe fn(*mut InferContext, u32, f32, bool, usize) -> bool, ) -> Result; } } From 1149186794a919d58cf5d43c7a497d81555f20c5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 23:01:57 +0100 Subject: [PATCH 43/91] feat(backend): expose tokenizer to the GenerationContext to decode token --- backends/llamacpp/src/backend.rs | 65 +++++++++++++++++++++----------- backends/llamacpp/src/main.rs | 21 ++++++++--- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 531a07dc6fb..08fac6755a7 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -13,6 +13,7 @@ use text_generation_router::validation::{ }; use text_generation_router::{FinishReason, Token}; use thiserror::Error; +use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -54,6 +55,7 @@ pub(crate) struct GenerationContext { pub(crate) struct InferContext { pub(crate) start: Instant, pub(crate) stream: UnboundedSender, + pub(crate) tokenizer: Tokenizer, pub(crate) generation: GenerationContext, } @@ -72,7 +74,10 @@ pub struct LlamaCppBackend { } impl LlamaCppBackend { - pub fn new + Send>(model_path: P) -> Result { + pub fn new + Send>( + model_path: P, + tokenizer: Tokenizer, + ) -> Result { let path = Arc::new(model_path.as_ref()); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( @@ -93,7 +98,7 @@ impl LlamaCppBackend { ); let (submitter, receiver) = channel(); - let handle = unsafe { spawn(|| scheduler_loop(backend, receiver)) }; + let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) }; Ok(Self { backlog: submitter, scheduler_handle: handle, @@ -110,19 +115,25 @@ fn llama_generate_callback( ) -> bool { info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); - // Decode token - let token = Token { - id: new_token_id, - text: "".to_string(), - logprob: new_token_logit, - special: false, - }; - let ctx = unsafe { &mut *ctx }; // Append the new token to the generated ones ctx.generation.generated_tokens.push(new_token_id); + // Decode token + let token = match ctx.tokenizer.decode(&[new_token_id], false) { + Ok(text) => { + let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text); + Token { + id: new_token_id, + text, + logprob: new_token_logit, + special, + } + } + Err(_) => panic!("Failed to decode token"), + }; + // Create the streamed response let response = match is_final { false => InferStreamResponse::Intermediate { @@ -131,21 +142,26 @@ fn llama_generate_callback( }, true => { // Decode the whole text - let text = String::new(); - - 
// Stream end response - InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text: GeneratedText { - text, - generated_tokens: n_generated_tokens as u32, - finish_reason: FinishReason::Length, - seed: Some(ctx.generation.sampling_params.seed), + match ctx + .tokenizer + .decode(&ctx.generation.generated_tokens, false) + { + Ok(text) => InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text, + generated_tokens: n_generated_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.generation.sampling_params.seed), + }, + start: ctx.start, + queued: ctx.start, }, - start: ctx.start, - queued: ctx.start, + Err(_) => panic!("Failed to decode token"), } + + // Stream end response } }; @@ -162,6 +178,7 @@ fn llama_generate_callback( unsafe fn scheduler_loop( mut backend: UniquePtr, + tokenizer: Tokenizer, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism @@ -170,6 +187,7 @@ unsafe fn scheduler_loop( loop { if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); + let tokenizer = tokenizer.clone(); let generation_params = generation.generation_params; // copy let sampling_params = generation.sampling_params; // copy let input_tokens = Arc::clone(&generation.input_tokens); @@ -179,6 +197,7 @@ unsafe fn scheduler_loop( let ctx = Box::new(InferContext { start, stream, + tokenizer, generation, }); diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index f128a6a3fc6..c5d735ab719 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -4,6 +4,7 @@ use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackend use text_generation_router::server::ApiDoc; use text_generation_router::{server, usage_stats}; use thiserror::Error; +use tokenizers::FromPretrainedParameters; /// App Configuration #[derive(Parser, Debug)] @@ -36,9 +37,9 @@ struct Args { port: u16, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, - #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] - num_model_instance: u16, - #[clap(default_value = "bigscience/bloom", long, env)] + // #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] + // num_model_instance: u16, + #[clap(long, env, required = true)] tokenizer_name: String, #[clap(long, env)] tokenizer_config_path: Option, @@ -94,7 +95,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, gguf_path, - num_model_instance, + // num_model_instance, tokenizer_name, tokenizer_config_path, revision, @@ -153,7 +154,17 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(gguf_path)?; + let auth_token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); + let options = FromPretrainedParameters { + revision: revision.clone().unwrap_or("main".to_string()), + user_agent: Default::default(), + auth_token, + }; + let tokenizer = tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) + .expect("Failed to retrieve tokenizer"); + let backend = LlamaCppBackend::new(gguf_path, tokenizer)?; // Run server server::run( From 52208f5b78fd8cc31d01f440b5f5e250896c1e64 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 23:24:50 +0100 Subject: [PATCH 44/91] misc(backend): decrease log verbosity in callback --- backends/llamacpp/src/backend.rs 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 08fac6755a7..62b4743daac 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -17,7 +17,7 @@ use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{error, info}; +use tracing::{debug, error, info}; type InferResult = Result; @@ -113,7 +113,7 @@ fn llama_generate_callback( is_final: bool, n_generated_tokens: usize, ) -> bool { - info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); + debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); let ctx = unsafe { &mut *ctx }; From 62dba1a878ba7e3c8151485adfb6159457c34c5a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:46:52 +0100 Subject: [PATCH 45/91] misc(cmake): use url deps and not git repo --- backends/llamacpp/cmake/fmt.cmake | 3 +-- backends/llamacpp/cmake/spdlog.cmake | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake index f94a9c5668f..840280ca8ba 100644 --- a/backends/llamacpp/cmake/fmt.cmake +++ b/backends/llamacpp/cmake/fmt.cmake @@ -1,6 +1,5 @@ FetchContent_Declare( fmt - GIT_REPOSITORY https://github.com/fmtlib/fmt - GIT_TAG 11.0.1 + URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz ) FetchContent_MakeAvailable(fmt) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 68658ba5019..04c218b5814 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -1,18 +1,17 @@ set(SPDLOG_USE_FMT ON) -set(SPDLOG_BUILD_SHARED OFF) +set(SPDLOG_BUILD_SHARED ON) set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") message(STATUS "Verbose logging is enabled in debug build") add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG) -else() +else () add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO) endif () fetchcontent_declare( spdlog - GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.14.1 + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) From 588421833c53b1ee6328b3d19650f6d93623e910 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:47:22 +0100 Subject: [PATCH 46/91] misc(backend): missing header --- backends/llamacpp/csrc/ffi.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index f33a2f1ad57..a3d14ee52f7 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "backend.hpp" From a1154b17ec8489ad77217af8cf52027e9413ecb6 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:47:38 +0100 Subject: [PATCH 47/91] feat(backend): avoid copy constructor --- backends/llamacpp/csrc/ffi.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index a3d14ee52f7..9daacf2c84d 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -101,8 +101,7 @@ namespace 
huggingface::tgi::backends::llamacpp { params.use_mmap = true; auto *model = llama_load_model_from_file(cxxPath.c_str(), params); - auto backend = single_worker_backend_t(model, std::nullopt); - return std::make_unique(std::move(backend)); + return std::make_unique(single_worker_backend_t { model, std::nullopt }); } } From 7eec0f704f05cbcc55e8b2a8132679b497d02fe3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:48:13 +0100 Subject: [PATCH 48/91] chore(backend): minor fixes mostly format --- backends/llamacpp/build.rs | 8 ++++---- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 7 ++++++- backends/llamacpp/src/backend.rs | 4 ++-- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index eefc6403278..1ab926d4635 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -99,11 +99,11 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); if is_debug { - println!("cargo:rustc-link-lib=static=fmtd"); - println!("cargo:rustc-link-lib=static=spdlogd"); + println!("cargo:rustc-link-lib=dylib=fmtd"); + println!("cargo:rustc-link-lib=dylib=spdlogd"); } else { - println!("cargo:rustc-link-lib=fmt"); - println!("cargo:rustc-link-lib=spdlog"); + println!("cargo:rustc-link-lib=dylib=fmt"); + println!("cargo:rustc-link-lib=dylib=spdlog"); } println!("cargo:rustc-link-lib=static=common"); diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f69563811da..739b84a1d36 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -170,7 +170,7 @@ namespace huggingface::tgi::backends::llamacpp { mContext_(llama_context_factory(model)), mWorker_(mModel_, params.value_or(llama_context_default_params())) { llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); - } + }; std::expected single_worker_backend_t::stream( diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index bf9df5cca0e..4abc202ded6 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -157,10 +157,11 @@ namespace huggingface::tgi::backends::llamacpp { class single_worker_backend_t : backend_base_t { private: - constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { + constexpr static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { auto llParams = llama_context_default_params(); llParams.flash_attn = true; llParams.n_batch = 1; + llParams.n_threads = 1; llParams.no_perf = true; llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL; @@ -173,6 +174,8 @@ namespace huggingface::tgi::backends::llamacpp { public: explicit single_worker_backend_t(llama_model *pModel, const std::optional &); + using backend_base_t::generate; + std::expected stream( std::span tokens, const generation_params_t &generation_params, @@ -185,6 +188,8 @@ namespace huggingface::tgi::backends::llamacpp { llama_context_ptr mContext_; public: + using backend_base_t::generate; + std::expected stream( std::span tokens, const generation_params_t &generation_params, diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 62b4743daac..609c8405767 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -70,7 +70,7 @@ pub enum LlamaCppBackendError { pub struct LlamaCppBackend { backlog: Sender<(GenerationContext, 
UnboundedSender)>, - scheduler_handle: JoinHandle<()>, + _scheduler_handle: JoinHandle<()>, } impl LlamaCppBackend { @@ -101,7 +101,7 @@ impl LlamaCppBackend { let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) }; Ok(Self { backlog: submitter, - scheduler_handle: handle, + _scheduler_handle: handle, }) } } From a7afde41a94776e7324137b1091a8883f0afde00 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:48:22 +0100 Subject: [PATCH 49/91] feat(backend): dockerfile --- Dockerfile.llamacpp | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 Dockerfile.llamacpp diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp new file mode 100644 index 00000000000..e24ce9bd32b --- /dev/null +++ b/Dockerfile.llamacpp @@ -0,0 +1,51 @@ +# Build dependencies resolver stage +FROM lukemathwalker/cargo-chef:latest AS chef +WORKDIR /usr/src/text-generation-inference/ + +FROM chef AS planner +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY backends backends +COPY benchmark benchmark +COPY clients clients +COPY launcher launcher +COPY router router + +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ + clang \ + cmake \ + gcc g++ \ + libc++-dev \ + libopenmpi-dev \ + libssl-dev \ + ninja-build \ + openssl \ + python3-dev + + +RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \ + && update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 10 \ + && update-alternatives --auto cc \ + && update-alternatives --auto c++ \ + && update-alternatives --display cc \ + && update-alternatives --display c++ \ + && cc --version \ + && c++ --version + +COPY --from=planner usr/src/text-generation-inference/recipe.json recipe.json +RUN cargo chef cook --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --recipe-path recipe.json + +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY backends backends +COPY benchmark benchmark +COPY launcher launcher +COPY router router + +ENV RUSTFLAGS="-L/usr/lib" +RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen \ No newline at end of file From 20652824d99076f58e989f46430f68b2d619f489 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 6 Nov 2024 17:33:37 +0100 Subject: [PATCH 50/91] feat(dockerfile): build process --- Dockerfile.llamacpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index e24ce9bd32b..0864c1bad08 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -15,6 +15,7 @@ COPY router router RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder +ENV CMAKE_INSTALL_PREFIX=${CWD}/dist RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ clang \ cmake \ @@ -26,7 +27,6 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ openssl \ python3-dev - RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \ && update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 10 \ && update-alternatives --auto cc \ @@ -36,7 +36,7 @@ RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \ && cc --version \ && c++ --version -COPY --from=planner usr/src/text-generation-inference/recipe.json recipe.json 
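# cargo-chef's recipe.json is produced in the planner stage under its WORKDIR,
# /usr/src/text-generation-inference/, hence the absolute path used below.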
+COPY --from=planner /usr/src/text-generation-inference/recipe.json recipe.json RUN cargo chef cook --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --recipe-path recipe.json COPY Cargo.lock Cargo.lock @@ -48,4 +48,8 @@ COPY launcher launcher COPY router router ENV RUSTFLAGS="-L/usr/lib" -RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen \ No newline at end of file +RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen + +FROM ubuntu:24.04 +COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher +COPY --from=builder /usr/src/text-generation-inference/dist /usr/ \ No newline at end of file From 26d0266cec6f327bd41c0a8050dbc1725e670f32 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 6 Nov 2024 17:46:46 +0100 Subject: [PATCH 51/91] feat(backend): handle all the tokenization failure and send back to the client --- backends/llamacpp/src/backend.rs | 69 +++++++++++++++++--------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 609c8405767..8214c36a73b 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -124,56 +124,59 @@ fn llama_generate_callback( let token = match ctx.tokenizer.decode(&[new_token_id], false) { Ok(text) => { let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text); - Token { + Ok(Token { id: new_token_id, text, logprob: new_token_logit, special, - } + }) } - Err(_) => panic!("Failed to decode token"), + Err(ref err) => Err(InferError::GenerationError(err.to_string())), }; // Create the streamed response - let response = match is_final { - false => InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - }, - true => { - // Decode the whole text - match ctx - .tokenizer - .decode(&ctx.generation.generated_tokens, false) - { - Ok(text) => InferStreamResponse::End { + let response = match token { + Ok(token) => { + match is_final { + false => Ok(InferStreamResponse::Intermediate { token, top_tokens: vec![], - generated_text: GeneratedText { - text, - generated_tokens: n_generated_tokens as u32, - finish_reason: FinishReason::Length, - seed: Some(ctx.generation.sampling_params.seed), - }, - start: ctx.start, - queued: ctx.start, - }, - Err(_) => panic!("Failed to decode token"), + }), + true => { + // Decode the whole text + match ctx + .tokenizer + .decode(&ctx.generation.generated_tokens, false) + { + Ok(text) => Ok(InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text, + generated_tokens: n_generated_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.generation.sampling_params.seed), + }, + start: ctx.start, + queued: ctx.start, + }), + Err(err) => Err(InferError::GenerationError(err.to_string())), + } + } } - - // Stream end response } + Err(err) => Err(err), }; // Send back to the client - if let Err(ref _err) = ctx.stream.send(Ok(response)) { + let should_stop = if let Err(ref _err) = ctx.stream.send(response) { error!("Failed to send back the response to the client, cancelling request"); - // TODO: cancel the request - return true; // should_stop - } + true + } else { + true + }; - // should_stop - false + should_stop } unsafe 
fn scheduler_loop( From cf17928f83fdb0d1224d5286b2953effde8cf28a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 8 Nov 2024 00:53:53 +0100 Subject: [PATCH 52/91] misc(cmake): remove dependency on fmt --- backends/llamacpp/CMakeLists.txt | 1 - backends/llamacpp/build.rs | 4 ++-- backends/llamacpp/cmake/fmt.cmake | 5 ----- backends/llamacpp/cmake/spdlog.cmake | 2 +- backends/llamacpp/csrc/backend.cpp | 4 +--- 5 files changed, 4 insertions(+), 12 deletions(-) delete mode 100644 backends/llamacpp/cmake/fmt.cmake diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index e536efc57a2..938f7360011 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -19,7 +19,6 @@ else () endif () # Add dependencies -include(cmake/fmt.cmake) include(cmake/spdlog.cmake) if (${LLAMA_CPP_BUILD_CUDA}) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 1ab926d4635..5331e87d451 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -99,10 +99,10 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); if is_debug { - println!("cargo:rustc-link-lib=dylib=fmtd"); + // println!("cargo:rustc-link-lib=dylib=fmtd"); println!("cargo:rustc-link-lib=dylib=spdlogd"); } else { - println!("cargo:rustc-link-lib=dylib=fmt"); + // println!("cargo:rustc-link-lib=dylib=fmt"); println!("cargo:rustc-link-lib=dylib=spdlog"); } diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake deleted file mode 100644 index 840280ca8ba..00000000000 --- a/backends/llamacpp/cmake/fmt.cmake +++ /dev/null @@ -1,5 +0,0 @@ -FetchContent_Declare( - fmt - URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz -) -FetchContent_MakeAvailable(fmt) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 04c218b5814..bd81d6d51a3 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -1,6 +1,6 @@ set(SPDLOG_USE_FMT ON) set(SPDLOG_BUILD_SHARED ON) -set(SPDLOG_FMT_EXTERNAL ON) +set(SPDLOG_FMT_EXTERNAL OFF) # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 739b84a1d36..11781273aed 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -7,9 +7,7 @@ #include #include -#include -#include -#include +#include #include #include "backend.hpp" From 4f5397c4147aab2e5818426e162321f7179dc2d1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 8 Nov 2024 00:54:05 +0100 Subject: [PATCH 53/91] misc(cmake): use URL base llama.cpp repo --- backends/llamacpp/CMakeLists.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 938f7360011..f92bbe68661 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -33,17 +33,14 @@ endif () # Download llama.cpp repo at the specific version fetchcontent_declare( llama - # DOWNLOAD_EXTRACT_TIMESTAMP TRUE - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3958 - GIT_SHALLOW FALSE + URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4048.tar.gz ) fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) 
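# fmt is no longer linked explicitly: spdlog is built against its bundled fmt
# (SPDLOG_FMT_EXTERNAL is OFF in cmake/spdlog.cmake), so only spdlog, llama and common remain.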
-target_link_libraries(tgi_llamacpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) +target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama common) install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") @@ -54,7 +51,7 @@ if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llamacpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama common spdlog::spdlog fmt::fmt) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama common spdlog::spdlog) endif () From 86d30aea43c6b858fa260aaa49b2c95320f97236 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 9 Nov 2024 22:10:33 +0100 Subject: [PATCH 54/91] feat(backend): simplify overall cpp structure --- backends/llamacpp/csrc/backend.cpp | 103 ++++---------------------- backends/llamacpp/csrc/backend.hpp | 110 ++-------------------------- backends/llamacpp/csrc/ffi.hpp | 79 +++++++------------- backends/llamacpp/offline/main.cpp | 43 +++++++---- backends/llamacpp/src/backend.rs | 113 +++++++++++++++++------------ backends/llamacpp/src/lib.rs | 9 +-- backends/llamacpp/src/main.rs | 8 +- 7 files changed, 144 insertions(+), 321 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 11781273aed..837f87ea052 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -49,43 +49,28 @@ namespace huggingface::tgi::backends::llamacpp { } llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); - return llama_sampler_ptr(pSampler, llama_sampler_deleter); + return {pSampler, llama_sampler_deleter}; } worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) - : mModel_(model), mParams_(params) { + : model_(model), context_(llama_new_context_with_model(model_.get(), params)) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; llama_model_meta_val_str(model.get(), "general.name", modelName, sizeof(modelName)); - SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); + SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } - void worker_t::loop(std::stop_source &driver, std::queue &backlog) const { - auto *context = llama_new_context_with_model(mModel_.get(), mParams_); - - while (!driver.stop_requested()) { - const auto generation_context = backlog.front(); - - generate(context, generation_context, std::nullopt); - backlog.pop(); - - SPDLOG_DEBUG("Processed request ({:d} remaining)", backlog.size()); - } - - llama_free(context); - } - - size_t worker_t::generate( - llama_context *context, - const generation_context_t &generation_context, - const std::optional &callback) const { + std::expected + worker_t::generate(const generation_context_t &generation_context, + const std::optional &callback) const { // Store information about context and generation size + const auto callback_ = callback.value_or(llama_void_callback); auto max_new_tokens = generation_context.generation_params.max_new_tokens; // Convert sampling params to what llama.cpp is looking for - auto sampler = generation_context.sampling_params.into_llama_sampler(mModel_.get()); + auto sampler = generation_context.sampling_params.into_llama_sampler(model_.get()); // Set up the prompt auto copy = std::vector(generation_context.input_tokens.begin(), 
generation_context.input_tokens.end()); @@ -94,11 +79,10 @@ namespace huggingface::tgi::backends::llamacpp { // Decode auto n_decoded_tokens = 0; for (bool generating = true; generating; ++n_decoded_tokens) { - const auto callback_ = callback.value_or(llama_void_callback); #ifdef TGI_LLAMACPP_BACKEND_DEBUG const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(context, batch); + const auto status = llama_decode(context_.get(), batch); const auto end = std::chrono::steady_clock::now(); const auto latency = std::chrono::duration_cast(end - start); SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); @@ -108,8 +92,8 @@ namespace huggingface::tgi::backends::llamacpp { batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) [[likely]] { // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); - auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id); + auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); + auto is_eog = llama_token_is_eog(model_.get(), new_token_id); auto new_token_logits = 0.0f; // TODO: return logit // Handle termination cases @@ -119,11 +103,8 @@ namespace huggingface::tgi::backends::llamacpp { generating = !(has_reach_max_tokens | has_reach_eog); // Bubble up the generated token if a callback is provided - const auto should_stop = std::invoke(std::forward(callback_), - new_token_id, - new_token_logits, - !generating, - n_decoded_tokens + 1); + const auto should_stop = + std::invoke(callback_, new_token_id, new_token_logits, !generating, n_decoded_tokens + 1); generating ^= should_stop; batch = llama_batch_get_one(&new_token_id, 1); @@ -132,62 +113,4 @@ namespace huggingface::tgi::backends::llamacpp { return n_decoded_tokens; } - - - backend_base_t::backend_base_t(llama_model *model) : mModel_(model, llama_free_model) { llama_backend_init(); } - - backend_base_t::~backend_base_t() { llama_backend_free(); } - - std::expected, backend_error_t> backend_base_t::generate( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const std::optional &callback - ) { - // TODO: Should we provide a way to change this value? 
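
One subtlety in the new decode loop earlier in this file's diff is the `generating ^= should_stop;` update: when the step has already hit EOG or the token budget (`generating == false`) and the callback also asks to stop, the XOR flips `generating` back to `true`. The short, self-contained Rust sketch below only prints the truth table against the `generating && !should_stop` variant; it is illustrative and does not touch llama.cpp:

    fn main() {
        // Truth table for the two candidate loop-condition updates
        for generating in [true, false] {
            for should_stop in [true, false] {
                println!(
                    "generating={generating}, should_stop={should_stop} -> xor={}, and-not={}",
                    generating ^ should_stop,
                    generating && !should_stop
                );
            }
        }
    }

Only the `generating=false, should_stop=true` row differs, which is exactly the cancel-on-final-token case; if that behaviour is unintended, `generating &= !should_stop` keeps the loop closed there.
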
- auto generated = std::vector(2 << 8); - auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos, - size_t num_generated_tokens) -> bool { - generated.emplace_back(new_token_id); - - if (callback.has_value()) - return (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); - return true; - }; - - auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback); - return generated; - } - - - /** Single worker_t Backend impl **/ - - single_worker_backend_t::single_worker_backend_t(llama_model *model, - const std::optional ¶ms) - : backend_base_t(model), - mContext_(llama_context_factory(model)), - mWorker_(mModel_, params.value_or(llama_context_default_params())) { - llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); - }; - - std::expected - single_worker_backend_t::stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback - ) { - return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens}, callback); - } - - std::expected - multi_worker_backend_t::stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback - ) { - SPDLOG_WARN("Not implemented for multi_worker_t"); - return 0; - } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 4abc202ded6..de37df75eb5 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -76,8 +76,8 @@ namespace huggingface::tgi::backends::llamacpp { */ class worker_t { private: - const std::shared_ptr mModel_; - const llama_context_params mParams_; + std::shared_ptr model_; + llama_context_ptr context_; public: /** @@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param model * @param params */ - worker_t(std::shared_ptr model, const llama_context_params ¶ms); + worker_t(std::shared_ptr, const llama_context_params &); /** * @@ -93,108 +93,8 @@ namespace huggingface::tgi::backends::llamacpp { * @param generation_context * @param callback */ - size_t - generate(llama_context *, const generation_context_t &, const std::optional &) const; - - /** - * - */ - void loop(std::stop_source &driver, std::queue &backlog) const; - }; - - - class backend_base_t { - - protected: - std::shared_ptr mModel_; - - public: - - /** - * - * @param model - */ - explicit backend_base_t(llama_model *model); - - /** - * Destructor - */ - ~backend_base_t(); - - /** - * - * @param tokens - * @param generation_params - * @param sampling_params - * @param callback - * @return - */ - [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, backend_error_t> generate( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const std::optional &callback = std::nullopt - ); - - /** - * - * @param tokens - * @param generation_params - * @param sampling_params - * @params callback - * @return - */ - [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - virtual std::expected stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback - ) = 0; - }; - - - class single_worker_backend_t : backend_base_t { - private: - constexpr 
static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { - auto llParams = llama_context_default_params(); - llParams.flash_attn = true; - llParams.n_batch = 1; - llParams.n_threads = 1; - llParams.no_perf = true; - llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL; - - return {llama_new_context_with_model(pModel, llParams), llama_context_deleter}; - }; - - llama_context_ptr mContext_; - worker_t mWorker_; - - public: - explicit single_worker_backend_t(llama_model *pModel, const std::optional &); - - using backend_base_t::generate; - - std::expected stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback) override; - }; - - class multi_worker_backend_t : backend_base_t { - private: - llama_context_ptr mContext_; - - public: - using backend_base_t::generate; - - std::expected stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback) override; + [[nodiscard]] std::expected + generate(const generation_context_t &, const std::optional &) const; }; } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 9daacf2c84d..51a524cbbdd 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -7,58 +7,41 @@ #include #include +#include #include #include #include -#include "backend.hpp" namespace huggingface::tgi::backends::llamacpp { - struct generation_params_t; - struct sampling_params_t; - - class llama_cpp_backend_impl_t; + class llama_cpp_worker_frontend_t; } - +#include "backend.hpp" #include "backends/llamacpp/src/lib.rs.h" #include "rust/cxx.h" namespace huggingface::tgi::backends::llamacpp { - // Concept identifying types which have a .generate() -> size_t method to do in-place generation - template - concept has_stream_method = requires( - T t, - std::span input_tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - llama_decode_callback callback - ) { - { - t.stream(input_tokens, generation_params, sampling_params, callback) - } -> std::same_as>; + auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; + auto make_shared_llama_model = [](llama_model *model) { + return std::shared_ptr(model, llama_model_deleter); }; - static_assert(has_stream_method, "single_worker_backend_t doesn't meet concept has_stream_method"); - static_assert(has_stream_method, "multi_worker_backend_t doesn't meet concept has_stream_method"); - - class llama_cpp_backend_exception_t : std::exception { - - }; + class llama_cpp_backend_exception_t : std::exception {}; /** - * Llama.cpp backend interfacing with Rust FFI layer + * Llama.cpp frontend over the worker interfacing with Rust FFI layer */ - class llama_cpp_backend_impl_t { + class llama_cpp_worker_frontend_t { private: - std::variant mInner_; + std::shared_ptr model_; + worker_t worker_; public: - explicit llama_cpp_backend_impl_t(single_worker_backend_t &&backend) : mInner_(std::move(backend)) {} - - explicit llama_cpp_backend_impl_t(multi_worker_backend_t &&backend) : mInner_(std::move(backend)) {} + explicit llama_cpp_worker_frontend_t(llama_model *model): + model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {} size_t stream( rust::Slice input_tokens, @@ -67,41 +50,31 @@ namespace huggingface::tgi::backends::llamacpp { InferContext *ctx, rust::Fn callback ) 
{ - // Define the visitor lambda function which requires the has_emplace_generate constraint on T - auto inner_fw = [=, &sampling_params, &ctx, &callback](T &&backend) - -> std::expected { - - auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { - return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); - }; - - // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* - auto input_tokens_v = - std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); - - return backend.stream( - input_tokens_v, - generation_params, - sampling_params, - context_forwarding_callback - ); + auto context_forwarding_callback = + [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { + return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); }; - if (const auto result = std::visit(inner_fw, mInner_); result.has_value()) { + // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token* + auto input_tokens_v = + std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); + + const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v}; + if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { return *result; } else { - throw llama_cpp_backend_exception_t(); + throw llama_cpp_backend_exception_t {}; } } }; - std::unique_ptr create_single_worker_backend(rust::Str modelPath) { + std::unique_ptr create_worker_frontend(rust::Str modelPath) { const auto cxxPath = std::string(modelPath); auto params = llama_model_default_params(); params.use_mmap = true; - auto *model = llama_load_model_from_file(cxxPath.c_str(), params); - return std::make_unique(single_worker_backend_t { model, std::nullopt }); + auto *model = (llama_load_model_from_file(cxxPath.c_str(), params)); + return std::make_unique(model); } } diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 7eb7dbde0a9..721abf051f5 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -1,16 +1,17 @@ // // Created by mfuntowicz on 10/3/24. 
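
On the Rust side, the `InferContext *ctx` argument taken by `stream(...)` above is produced with `Box::into_raw` before the FFI call and reclaimed with `Box::from_raw` afterwards. Below is a minimal, dependency-free sketch of that hand-off; every name in it (`InferCtx`, `fake_stream`, `on_token`) is an illustrative stand-in rather than one of the backend's actual types:

    struct InferCtx {
        generated: Vec<u32>,
    }

    // Stand-in for the C++ side: it only forwards tokens to the provided callback.
    fn fake_stream(ctx: *mut InferCtx, cb: fn(*mut InferCtx, u32) -> bool) {
        for token in [17u32, 42, 7] {
            if cb(ctx, token) {
                break; // the callback asked us to stop
            }
        }
    }

    fn on_token(ctx: *mut InferCtx, token: u32) -> bool {
        // SAFETY: the caller guarantees `ctx` stays alive for the whole call.
        let ctx = unsafe { &mut *ctx };
        ctx.generated.push(token);
        false // keep generating
    }

    fn main() {
        let boxed = Box::new(InferCtx { generated: Vec::new() });
        // Leak the box so the callee can borrow it through a raw pointer...
        let raw = Box::into_raw(boxed);
        fake_stream(raw, on_token);
        // ...and reclaim ownership afterwards so it is dropped exactly once.
        let ctx = unsafe { Box::from_raw(raw) };
        println!("generated: {:?}", ctx.generated);
    }

Keeping the `from_raw` on the caller's side guarantees the context outlives every callback invocation while still being freed exactly once.
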
// +#include -#include -#include -#include -#include +#include #include +#include s #include "../csrc/backend.hpp" using namespace huggingface::tgi::backends::llamacpp; +const auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; + int main(int argc, char **argv) { if (argc < 2) { fmt::print("No model folder provider"); @@ -18,21 +19,31 @@ int main(int argc, char **argv) { } spdlog::set_level(spdlog::level::debug); - + const auto modelPath = absolute(std::filesystem::path(argv[1])); const auto params = llama_model_default_params(); - auto *model = llama_load_model_from_file(modelPath.c_str(), params); + auto model = std::unique_ptr( + llama_load_model_from_file(modelPath.c_str(), params) + ); - auto backend = single_worker_backend_t(model, {}); + auto prompt = "My name is Morgan"; + auto tokens = std::vector(16); + const auto nb_tokens = llama_tokenize(model.get(), prompt, sizeof(prompt), tokens.data(), tokens.size(), true, + false); + tokens.resize(nb_tokens); + auto backend = worker_t{std::move(model), {.n_batch = 1, .n_threads = 4}}; + + fmt::println("Tokenized: {}", tokens); // generate - const auto promptTokens = {128000, 5159, 836, 374, 23809, 11}; - const auto out = backend.generate(promptTokens, {.max_new_tokens = 32}, {.top_k = 40}); - - if (out.has_value()) - fmt::print(FMT_STRING("Generated: {}"), *out); - else { - const auto err = out.error(); - fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); - } + auto generated_tokens = std::vector(32); + const auto n_generated_tokens = backend.generate( + {{.max_new_tokens = 32}, {.top_k = 40}, tokens}, + [&generated_tokens](llama_token new_token_id, float_t logit, bool is_eos, size_t step) -> bool { + generated_tokens.emplace(generated_tokens.begin() + (step - 1), new_token_id); + return false; + } + ); + generated_tokens.resize(n_generated_tokens.value()); + fmt::println("Generated {} tokens", generated_tokens); } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 8214c36a73b..8e36aa63160 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,8 +1,9 @@ use crate::ffi::{ - create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, + create_worker_frontend, GenerationParams, LlamaCppWorkerFrontend, SamplingParams, }; use async_trait::async_trait; use cxx::UniquePtr; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; @@ -21,7 +22,7 @@ use tracing::{debug, error, info}; type InferResult = Result; -unsafe impl Send for LlamaCppBackendImpl {} +unsafe impl Send for LlamaCppWorkerFrontend {} impl From<&ValidParameters> for SamplingParams { fn from(v: &ValidParameters) -> Self { @@ -68,41 +69,54 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -pub struct LlamaCppBackend { - backlog: Sender<(GenerationContext, UnboundedSender)>, - _scheduler_handle: JoinHandle<()>, +// pub struct LlamaCppBackend { +// backlog: Sender<(GenerationContext, UnboundedSender)>, +// _scheduler_handle: JoinHandle<()>, +// } + +struct LlamaCppWorker { + sender: Sender<(GenerationContext, UnboundedSender)>, + handle: JoinHandle<()>, +} + +pub enum LlamaCppBackend { + Single(LlamaCppWorker), + // Multi(Vec) } impl LlamaCppBackend { - pub fn new + Send>( + fn allocate_worker( + path: &Path, + ) -> Result, LlamaCppBackendError> { + create_worker_frontend(&path.display().to_string()).map_err(|ref err| { + 
LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string()) + }) + } + + pub fn new>( model_path: P, tokenizer: Tokenizer, + num_cores_per_instance: u16, ) -> Result { - let path = Arc::new(model_path.as_ref()); + let shared_path = Arc::new(model_path); + let path = shared_path.deref().as_ref(); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( path.display().to_string(), )); } - let backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { - LlamaCppBackendError::ModelInitializationFailed( - path.to_path_buf(), - err.what().to_string(), - ) - })?; - - info!( - "Successfully initialized llama.cpp backend from {}", - path.display() - ); + let worker = match num_cores_per_instance { + 0 => { + let worker = Self::allocate_worker(path)?; + let (sender, receiver) = channel(); + let handle = spawn(|| scheduler_loop(worker, tokenizer, receiver)); + LlamaCppBackend::Single(LlamaCppWorker { sender, handle }) + } + _ => panic!("No supported yet"), + }; - let (submitter, receiver) = channel(); - let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) }; - Ok(Self { - backlog: submitter, - _scheduler_handle: handle, - }) + Ok(worker) } } @@ -169,18 +183,16 @@ fn llama_generate_callback( }; // Send back to the client - let should_stop = if let Err(ref _err) = ctx.stream.send(response) { + if let Err(ref _err) = ctx.stream.send(response) { error!("Failed to send back the response to the client, cancelling request"); true } else { - true - }; - - should_stop + false + } } -unsafe fn scheduler_loop( - mut backend: UniquePtr, +fn scheduler_loop( + mut backend: UniquePtr, tokenizer: Tokenizer, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { @@ -204,20 +216,23 @@ unsafe fn scheduler_loop( generation, }); - let boxed_ctx = Box::into_raw(ctx); + // We leak the box to avoid it being freed after the first callback call + // when going out of scope + unsafe { + let boxed_ctx = Box::into_raw(ctx); + if let Err(e) = backend.pin_mut().stream( + &input_tokens, + generation_params, + &sampling_params, + boxed_ctx, + llama_generate_callback, + ) { + error!("Error while decoding tokens... {}", e.what()); + } - if let Err(e) = backend.pin_mut().stream( - &input_tokens, - generation_params, - &sampling_params, - boxed_ctx, - llama_generate_callback, - ) { - error!("Error while decoding tokens... 
{}", e.what()); + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(boxed_ctx); } - - // Make sure we re-keep track of the OpaqueStream box - let _ = Box::from_raw(boxed_ctx); } } else { info!("IPC channel is closed, exiting the scheduler loop"); @@ -244,11 +259,13 @@ impl Backend for LlamaCppBackend { sampling_params, }; - match self.backlog.send((ctx, sx)) { - Ok(_) => Ok(UnboundedReceiverStream::new(rx)), - Err(_) => Err(InferError::GenerationError( - "Failed to sent the request".to_string(), - )), + match self { + LlamaCppBackend::Single(worker) => match worker.sender.send((ctx, sx)) { + Ok(_) => Ok(UnboundedReceiverStream::new(rx)), + Err(_) => Err(InferError::GenerationError( + "Failed to sent the request".to_string(), + )), + }, } } else { Err(InferError::GenerationError( diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index abcdd1fad06..4f0fa800276 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -46,14 +46,13 @@ mod ffi { type SamplingParams; /// Represent an instance of the llama.cpp backend instance on C++ side - #[cxx_name = "llama_cpp_backend_impl_t"] - type LlamaCppBackendImpl; + #[cxx_name = "llama_cpp_worker_frontend_t"] + type LlamaCppWorkerFrontend; - #[rust_name = "create_single_worker_backend"] - fn create_single_worker_backend(modelPath: &str) -> Result>; + fn create_worker_frontend(modelPath: &str) -> Result>; unsafe fn stream( - self: Pin<&mut LlamaCppBackendImpl>, + self: Pin<&mut LlamaCppWorkerFrontend>, tokens: &[u32], generation_params: GenerationParams, sampling_params: &SamplingParams, diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index c5d735ab719..a2abd5556a8 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -37,8 +37,8 @@ struct Args { port: u16, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, - // #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] - // num_model_instance: u16, + #[clap(long, env, help = "Number of CPU core per instance(s)")] + num_cores_per_instance: Option, #[clap(long, env, required = true)] tokenizer_name: String, #[clap(long, env)] @@ -95,7 +95,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, gguf_path, - // num_model_instance, + num_cores_per_instance, tokenizer_name, tokenizer_config_path, revision, @@ -164,7 +164,7 @@ async fn main() -> Result<(), RouterError> { }; let tokenizer = tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) .expect("Failed to retrieve tokenizer"); - let backend = LlamaCppBackend::new(gguf_path, tokenizer)?; + let backend = LlamaCppBackend::new(gguf_path, tokenizer, num_cores_per_instance.unwrap_or(0))?; // Run server server::run( From 6915fa3441e3ab0026d996e3c6c100930b1e5dda Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 9 Nov 2024 22:19:38 +0100 Subject: [PATCH 55/91] feat(backend): remove reinterpret_cast converting from uint32_t to llama_token(int32_t) --- backends/llamacpp/csrc/ffi.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 51a524cbbdd..70669b7cdb9 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -56,8 +56,8 @@ namespace huggingface::tgi::backends::llamacpp { }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token* - auto input_tokens_v = - 
std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); + auto input_tokens_v = std::vector(input_tokens.size()); + std::memcpy(input_tokens_v.data(), input_tokens.data(), input_tokens.size()); const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v}; if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { From 7e2890fe2cf14270e6e7ecd92500072b4655ab8c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 11 Nov 2024 19:50:11 +0100 Subject: [PATCH 56/91] feat(backend): remove unused function --- backends/llamacpp/csrc/backend.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 837f87ea052..66017fc5513 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -14,19 +14,6 @@ namespace huggingface::tgi::backends::llamacpp { - void llama_batch_fill_prompt(llama_batch &batch, std::span input_tokens) { - for (auto i = 0; i < input_tokens.size(); ++i) { - batch.token[i] = input_tokens[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i] = nullptr; - batch.logits[i] = false; - ++batch.n_tokens; - } - - batch.logits[batch.n_tokens] = true; - } - llama_sampler_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { auto *pSampler = llama_sampler_chain_init({.no_perf = false}); From 488ba938983ec7b0cf47e4a53ff28b590fb3de31 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 11 Nov 2024 19:50:33 +0100 Subject: [PATCH 57/91] feat(backend): fix invalid reference to context in release mode --- backends/llamacpp/csrc/backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 66017fc5513..eb91e51782c 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -74,7 +74,7 @@ namespace huggingface::tgi::backends::llamacpp { const auto latency = std::chrono::duration_cast(end - start); SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(context, batch); + const auto status = llama_decode(context_.get(), batch); #endif batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) [[likely]] { From 363d5e45de275b3c2739e2a4f9abad5cfa7e9baa Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:07:59 +0100 Subject: [PATCH 58/91] feat(backend): use std::ranges to map uint32_t to llama_token --- backends/llamacpp/csrc/ffi.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 70669b7cdb9..948e96a0d1d 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -8,8 +8,8 @@ #include #include #include +#include #include -#include #include @@ -56,9 +56,16 @@ namespace huggingface::tgi::backends::llamacpp { }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token* - auto input_tokens_v = std::vector(input_tokens.size()); - std::memcpy(input_tokens_v.data(), input_tokens.data(), input_tokens.size()); + static auto as_llama_token = [](const uint32_t x){ return static_cast(x); }; +#ifdef __cpp_lib_ranges_to_container + auto input_tokens_v = input_tokens | std::views::transform(as_llama_token) | std::ranges::to(); +#else + auto input_tokens_ = 
input_tokens | std::views::transform(as_llama_token); + auto input_tokens_v = std::vector(input_tokens_.begin(), input_tokens_.end()); +#endif + + // Defer the generation to the actual worker_t const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v}; if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { return *result; From 02cd6fe427b8ba705a4a138926971f8dc5562a9f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:08:26 +0100 Subject: [PATCH 59/91] chore(backend): minor improvements --- backends/llamacpp/csrc/ffi.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 948e96a0d1d..43694fa3276 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -50,6 +50,8 @@ namespace huggingface::tgi::backends::llamacpp { InferContext *ctx, rust::Fn callback ) { + // Wrapper around the provided Rust callback to inject the InferContext when returning from the C++ FFI boundaries + // It captures the context (ctx) using reference and will automatically call the Rust callback forwarding the InferContext auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); @@ -76,11 +78,18 @@ namespace huggingface::tgi::backends::llamacpp { }; std::unique_ptr create_worker_frontend(rust::Str modelPath) { - const auto cxxPath = std::string(modelPath); + // Initialize the numa context from numactl + static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){ + llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL); + return true; + }(); + + // Allocate model weights parameters auto params = llama_model_default_params(); params.use_mmap = true; - auto *model = (llama_load_model_from_file(cxxPath.c_str(), params)); + // Allocate the model from the Rust provided, string path + auto *model = (llama_load_model_from_file(static_cast(modelPath).c_str(), params)); return std::make_unique(model); } } From daf1631e09710343c3e208be3282bd53c4cf3ccd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:08:49 +0100 Subject: [PATCH 60/91] dockerfile(backend): initial working version of llama.cpp container --- Dockerfile.llamacpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 0864c1bad08..3dab2a2968d 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -15,8 +15,10 @@ COPY router router RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder -ENV CMAKE_INSTALL_PREFIX=${CWD}/dist -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ +ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ clang \ cmake \ gcc g++ \ @@ -48,8 +50,23 @@ COPY launcher launcher COPY router router ENV RUSTFLAGS="-L/usr/lib" +ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen -FROM ubuntu:24.04 +FROM ubuntu:22.04 +ENV DEBIAN_FRONTEND=noninteractive + +RUN 
--mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && \ + apt upgrade -y && \ + apt install -y \ + openssl \ + python3.11-dev + COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher -COPY --from=builder /usr/src/text-generation-inference/dist /usr/ \ No newline at end of file +COPY --from=builder /usr/src/text-generation-inference/dist /usr/ + +ENV PORT=8080 +WORKDIR /usr/src/text-generation-inference +ENTRYPOINT ["text-generation-launcher"] \ No newline at end of file From 57b215467bc28b37a2b7a4ca98ea74b4a171d179 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:22:11 +0100 Subject: [PATCH 61/91] feat(backend): simplify Rust callback --- backends/llamacpp/src/backend.rs | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 8e36aa63160..2dd5b70d116 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -134,23 +134,18 @@ fn llama_generate_callback( // Append the new token to the generated ones ctx.generation.generated_tokens.push(new_token_id); - // Decode token - let token = match ctx.tokenizer.decode(&[new_token_id], false) { + // Generate response + let response = match ctx.tokenizer.decode(&[new_token_id], false) { Ok(text) => { let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text); - Ok(Token { + let token = Token { id: new_token_id, text, logprob: new_token_logit, special, - }) - } - Err(ref err) => Err(InferError::GenerationError(err.to_string())), - }; + }; - // Create the streamed response - let response = match token { - Ok(token) => { + // Should we generate an ending or intermediate response? 
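
This patch condenses the callback's tail: the decode loop is asked to stop only when pushing the response to the client stream fails, now expressed with `inspect_err` plus `is_err()` at the end of this hunk. A tiny self-contained sketch of that stop contract, with a plain std channel standing in for the Tokio sender and `StreamItem`/`push_to_client` as illustrative names:

    use std::sync::mpsc::{channel, Sender};

    enum StreamItem {
        Token(u32),
        End,
    }

    // Returning `true` is the signal for the decode loop to stop.
    fn push_to_client(stream: &Sender<StreamItem>, item: StreamItem) -> bool {
        let status = stream.send(item).inspect_err(|err| {
            eprintln!("Failed to send back the response: {err}");
        });
        status.is_err()
    }

    fn main() {
        let (tx, rx) = channel();
        assert!(!push_to_client(&tx, StreamItem::Token(42))); // receiver alive -> keep decoding
        drop(rx); // the client went away
        assert!(push_to_client(&tx, StreamItem::End)); // send fails -> stop decoding
    }

Functionally this matches the previous `if let` version; only the intermediate branching is gone.
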
match is_final { false => Ok(InferStreamResponse::Intermediate { token, @@ -179,16 +174,14 @@ fn llama_generate_callback( } } } - Err(err) => Err(err), + Err(ref err) => Err(InferError::GenerationError(err.to_string())), }; // Send back to the client - if let Err(ref _err) = ctx.stream.send(response) { - error!("Failed to send back the response to the client, cancelling request"); - true - } else { - false - } + let status = ctx.stream.send(response).inspect_err(|err| { + error!("Failed to send back the response: {}", err); + }); + status.is_err() } fn scheduler_loop( From 6f059c4b5ddc7ddc3e5654767c9b7d24caa517da Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 14 Nov 2024 08:41:38 +0100 Subject: [PATCH 62/91] feat(backend): wrap Arc tokenizer to avoid duplicating --- backends/llamacpp/src/backend.rs | 60 ++++++++++++++------------------ backends/llamacpp/src/lib.rs | 2 +- backends/llamacpp/src/main.rs | 7 ++-- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 2dd5b70d116..dc29b707a1e 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -53,10 +53,10 @@ pub(crate) struct GenerationContext { pub(crate) sampling_params: SamplingParams, } -pub(crate) struct InferContext { +pub(crate) struct InferContext<'a> { pub(crate) start: Instant, pub(crate) stream: UnboundedSender, - pub(crate) tokenizer: Tokenizer, + pub(crate) tokenizer: &'a Tokenizer, pub(crate) generation: GenerationContext, } @@ -69,11 +69,6 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -// pub struct LlamaCppBackend { -// backlog: Sender<(GenerationContext, UnboundedSender)>, -// _scheduler_handle: JoinHandle<()>, -// } - struct LlamaCppWorker { sender: Sender<(GenerationContext, UnboundedSender)>, handle: JoinHandle<()>, @@ -95,7 +90,7 @@ impl LlamaCppBackend { pub fn new>( model_path: P, - tokenizer: Tokenizer, + tokenizer: Arc, num_cores_per_instance: u16, ) -> Result { let shared_path = Arc::new(model_path); @@ -110,7 +105,7 @@ impl LlamaCppBackend { 0 => { let worker = Self::allocate_worker(path)?; let (sender, receiver) = channel(); - let handle = spawn(|| scheduler_loop(worker, tokenizer, receiver)); + let handle = spawn(move || scheduler_loop(worker, tokenizer, receiver)); LlamaCppBackend::Single(LlamaCppWorker { sender, handle }) } _ => panic!("No supported yet"), @@ -186,7 +181,7 @@ fn llama_generate_callback( fn scheduler_loop( mut backend: UniquePtr, - tokenizer: Tokenizer, + tokenizer: Arc, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism @@ -195,37 +190,34 @@ fn scheduler_loop( loop { if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); - let tokenizer = tokenizer.clone(); let generation_params = generation.generation_params; // copy let sampling_params = generation.sampling_params; // copy let input_tokens = Arc::clone(&generation.input_tokens); // Creating the whole InferContext and pushing it to the heap - { - let ctx = Box::new(InferContext { - start, - stream, - tokenizer, - generation, - }); - - // We leak the box to avoid it being freed after the first callback call - // when going out of scope - unsafe { - let boxed_ctx = Box::into_raw(ctx); - if let Err(e) = backend.pin_mut().stream( - &input_tokens, - generation_params, - &sampling_params, - boxed_ctx, - llama_generate_callback, - ) { - error!("Error while 
decoding tokens... {}", e.what()); - } + let ctx = Box::new(InferContext { + start, + stream, + tokenizer: &tokenizer, + generation, + }); - // Make sure we re-keep track of the OpaqueStream box - let _ = Box::from_raw(boxed_ctx); + // We leak the box to avoid it being freed after the first callback call + // when going out of scope + unsafe { + let boxed_ctx = Box::into_raw(ctx); + if let Err(e) = backend.pin_mut().stream( + &input_tokens, + generation_params, + &sampling_params, + boxed_ctx, + llama_generate_callback, + ) { + error!("Error while decoding tokens... {}", e.what()); } + + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(boxed_ctx); } } else { info!("IPC channel is closed, exiting the scheduler loop"); diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 4f0fa800276..8fc989552be 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -33,7 +33,7 @@ mod ffi { } extern "Rust" { - type InferContext; + type InferContext<'a>; } unsafe extern "C++" { diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index a2abd5556a8..adc183edc5b 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,5 +1,6 @@ use clap::{Parser, Subcommand}; use std::path::PathBuf; +use std::sync::Arc; use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackendError}; use text_generation_router::server::ApiDoc; use text_generation_router::{server, usage_stats}; @@ -162,8 +163,10 @@ async fn main() -> Result<(), RouterError> { user_agent: Default::default(), auth_token, }; - let tokenizer = tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) - .expect("Failed to retrieve tokenizer"); + let tokenizer = Arc::new( + tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) + .expect("Failed to retrieve tokenizer"), + ); let backend = LlamaCppBackend::new(gguf_path, tokenizer, num_cores_per_instance.unwrap_or(0))?; // Run server From 70c90ad93395bf5ea317efc10fdcd21d916cd89a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 14 Nov 2024 09:04:06 +0100 Subject: [PATCH 63/91] feat(backend): update llamacpp to 4077 --- backends/llamacpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index f92bbe68661..73369935594 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -33,7 +33,7 @@ endif () # Download llama.cpp repo at the specific version fetchcontent_declare( llama - URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4048.tar.gz + URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4077.tar.gz ) fetchcontent_makeavailable(llama) From 23d2bcf28dbbb01f5391a5ec56a3163ef9f018eb Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 14 Nov 2024 09:38:13 +0100 Subject: [PATCH 64/91] misc(build): improve build process --- backends/llamacpp/CMakeLists.txt | 7 +++---- backends/llamacpp/build.rs | 14 ++------------ backends/llamacpp/cmake/spdlog.cmake | 8 +++++++- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 73369935594..f6dd2db1db7 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,7 +6,6 @@ set(CMAKE_CXX_STANDARD 23) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against") 
-set(LLAMA_BUILD_COMMON ON) set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build") option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") @@ -40,8 +39,8 @@ fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama common) -install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) +target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama) +install(TARGETS tgi_llamacpp_backend_impl spdlog llama) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") target_compile_definitions(tgi_llamacpp_backend_impl PRIVATE TGI_LLAMACPP_BACKEND_DEBUG=1) @@ -51,7 +50,7 @@ if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llamacpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama common spdlog::spdlog) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama spdlog::spdlog) endif () diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 5331e87d451..0e9f2ae9afe 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -59,9 +59,6 @@ fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { cxx_build::bridge("src/lib.rs") .static_flag(true) .std("c++23") - .include(deps_folder.join("spdlog-src").join("include")) // Why spdlog doesnt install headers? - .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why ggml doesnt install headers? - .include(deps_folder.join("llama-src").join("common").join("include")) // Why common doesnt install headers? 
.include(install_prefix.join("include")) .include("csrc") .file("csrc/ffi.hpp") @@ -98,15 +95,8 @@ fn main() { // Linkage info println!("cargo:rustc-link-search=native={}", out_dir.display()); - if is_debug { - // println!("cargo:rustc-link-lib=dylib=fmtd"); - println!("cargo:rustc-link-lib=dylib=spdlogd"); - } else { - // println!("cargo:rustc-link-lib=dylib=fmt"); - println!("cargo:rustc-link-lib=dylib=spdlog"); - } - - println!("cargo:rustc-link-lib=static=common"); + let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; + println!("cargo:rustc-link-lib=static={spdlog_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index bd81d6d51a3..f9d590a7847 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -1,6 +1,12 @@ set(SPDLOG_USE_FMT ON) -set(SPDLOG_BUILD_SHARED ON) +set(SPDLOG_BUILD_SHARED OFF) set(SPDLOG_FMT_EXTERNAL OFF) +set(SPDLOG_INSTALL ON) +set(SPDLOG_NO_ATOMIC_LEVELS ON) # We are not modifying log levels concurrently + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(SPDLOG_CLOCK_COARSE ON) +endif () # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") From 5335bf973b2fef2c592a3061ccbf9e5c4fec7ab7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 21 Nov 2024 00:03:05 +0100 Subject: [PATCH 65/91] feat(backend): multistream inference on CPU --- Cargo.lock | 1 + backends/llamacpp/CMakeLists.txt | 6 + backends/llamacpp/Cargo.toml | 1 + backends/llamacpp/build.rs | 5 +- backends/llamacpp/cmake/numa.cmake | 20 ++++ backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/ffi.hpp | 23 ++++ backends/llamacpp/src/backend.rs | 173 +++++++++++++++++++++++------ backends/llamacpp/src/lib.rs | 2 + 9 files changed, 198 insertions(+), 35 deletions(-) create mode 100644 backends/llamacpp/cmake/numa.cmake diff --git a/Cargo.lock b/Cargo.lock index 6b6cb7a7e18..81b7c282a7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4229,6 +4229,7 @@ dependencies = [ "log", "metrics", "metrics-exporter-prometheus", + "num_cpus", "pkg-config", "serde_json", "text-generation-router", diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index f6dd2db1db7..13107e0abce 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -18,6 +18,7 @@ else () endif () # Add dependencies +include(cmake/numa.cmake) include(cmake/spdlog.cmake) if (${LLAMA_CPP_BUILD_CUDA}) @@ -40,6 +41,11 @@ fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama) + +if (NUMA_FOUND) + target_link_libraries(tgi_llamacpp_backend_impl PUBLIC numa) +endif () + install(TARGETS tgi_llamacpp_backend_impl spdlog llama) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 48a0bb84362..0a5039b3034 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -9,6 +9,7 @@ homepage.workspace = true async-trait = "0.1" clap = { version = "4.5.19", features = ["derive"] } cxx = "1.0" +num_cpus = "1" hf-hub = { workspace = true } image = { version = "0.25.1", features = ["default-formats"] } metrics = { workspace = true } diff --git 
a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 0e9f2ae9afe..22726db1843 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -86,6 +86,7 @@ fn main() { // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); + probe!("numa", "2.0"); // Backend BACKEND_DEPS.iter().for_each(|name| { @@ -96,7 +97,9 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; - println!("cargo:rustc-link-lib=static={spdlog_linkage_target}"); + let fmt_linkage_target = if is_debug { "fmtd" } else { "fmt" }; + println!("cargo:rustc-link-lib=dylib={spdlog_linkage_target}"); + println!("cargo:rustc-link-lib=dylib={fmt_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); diff --git a/backends/llamacpp/cmake/numa.cmake b/backends/llamacpp/cmake/numa.cmake new file mode 100644 index 00000000000..0399b752ce9 --- /dev/null +++ b/backends/llamacpp/cmake/numa.cmake @@ -0,0 +1,20 @@ +# Find the numa policy library. +# Output variables: +# NUMA_INCLUDE_DIR : e.g., /usr/include/. +# NUMA_LIBRARY : Library path of numa library +# NUMA_FOUND : True if found. +FIND_PATH(NUMA_INCLUDE_DIR NAME numa.h + HINTS $ENV{HOME}/local/include /opt/local/include /usr/local/include /usr/include) + +FIND_LIBRARY(NUMA_LIBRARY NAME numa + HINTS $ENV{HOME}/local/lib64 $ENV{HOME}/local/lib /usr/local/lib64 /usr/local/lib /opt/local/lib64 /opt/local/lib /usr/lib64 /usr/lib +) + +IF (NUMA_INCLUDE_DIR AND NUMA_LIBRARY) + SET(NUMA_FOUND TRUE) + MESSAGE(STATUS "Found numa library: inc=${NUMA_INCLUDE_DIR}, lib=${NUMA_LIBRARY}") +ELSE () + SET(NUMA_FOUND FALSE) + MESSAGE(STATUS "WARNING: Numa library not found.") + MESSAGE(STATUS "Try: 'sudo apt-get install libnuma libnuma-dev' (or sudo yum install numactl numactl-devel)") +ENDIF () \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index eb91e51782c..a30eb217e95 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -45,7 +45,7 @@ namespace huggingface::tgi::backends::llamacpp { #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; llama_model_meta_val_str(model.get(), "general.name", modelName, sizeof(modelName)); - SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); + SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 43694fa3276..9700f52e201 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -5,13 +5,19 @@ #ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP #define TGI_LLAMA_CPP_BACKEND_FFI_HPP +#include #include #include #include #include #include +#include #include +#include +#include + +#include namespace huggingface::tgi::backends::llamacpp { class llama_cpp_worker_frontend_t; @@ -92,6 +98,23 @@ namespace huggingface::tgi::backends::llamacpp { auto *model = (llama_load_model_from_file(static_cast(modelPath).c_str(), params)); return std::make_unique(model); } + + void set_numactl_core_affinity(rust::Slice affinity) { + SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id()); +// auto nodes = std::unordered_set(); + auto cpumask = numa_allocate_cpumask(); + for(auto core : affinity) { + numa_bitmask_setbit(cpumask, core); + numa_sched_setaffinity(0, 
cpumask); + } + +//#ifdef TGI_LLAMACPP_BACKEND_DEBUG + auto cpumask_check = numa_allocate_cpumask(); + numa_sched_getaffinity(0, cpumask_check); + SPDLOG_DEBUG(FMT_STRING("numa_sched_affinity for thread {} -> {:b}"), std::this_thread::get_id(), *cpumask_check->maskp); +//#endif + + } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index dc29b707a1e..fa5bfbab0e3 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,13 +1,17 @@ use crate::ffi::{ - create_worker_frontend, GenerationParams, LlamaCppWorkerFrontend, SamplingParams, + create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend, + SamplingParams, }; use async_trait::async_trait; use cxx::UniquePtr; -use std::ops::Deref; +use log::warn; +use std::cell::RefCell; +use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; -use std::thread::{spawn, JoinHandle}; +use std::thread::spawn; +use text_generation_router::infer::InferError::GenerationError; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::{ ValidGenerateRequest, ValidParameters, ValidStoppingParameters, @@ -15,11 +19,41 @@ use text_generation_router::validation::{ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; -use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tokio::sync::{Semaphore, SemaphorePermit, TryAcquireError}; +use tokio::task::JoinHandle; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; +macro_rules! send_or_warn { + ($send: expr, $err: expr) => { + if let Err(se) = $send.send(err) { + warn!( + "Failed to send message back to the user: {}. Originating error: {}", + se, e + ); + } + }; +} + +fn get_num_cores() -> usize { + match option_env!("TGI_USE_PHYSICAL_CORES") + .unwrap_or("OFF") + .to_uppercase() + .as_str() + { + "ON" => { + info!("Using only physical cores on the machine"); + num_cpus::get_physical() + } + _ => { + info!("Using physical and logical cores on the machine"); + num_cpus::get() + } + } +} + type InferResult = Result; unsafe impl Send for LlamaCppWorkerFrontend {} @@ -71,12 +105,19 @@ pub enum LlamaCppBackendError { struct LlamaCppWorker { sender: Sender<(GenerationContext, UnboundedSender)>, - handle: JoinHandle<()>, } -pub enum LlamaCppBackend { - Single(LlamaCppWorker), - // Multi(Vec) +impl LlamaCppWorker { + fn submit(&self, ctx: GenerationContext, sx: UnboundedSender) { + if let Err(err) = self.sender.send((ctx, sx)) { + // TODO: What do we do? 
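
The `get_cores_allocation` helper introduced further down in this diff partitions the visible cores into one contiguous range per worker instance. A self-contained sketch of that partitioning on a hypothetical 16-core host; `cores_allocation` here is an illustrative standalone copy of the logic, not the backend function itself:

    use std::ops::Range;

    // Mirrors the allocation logic so the resulting ranges can be inspected in
    // isolation (the real helper reads the core count from `num_cpus`).
    fn cores_allocation(cores_count: usize, num_cores_per_instance: usize) -> Vec<Range<usize>> {
        // 0 means "give every core to a single instance"
        let mut per_instance = if num_cores_per_instance == 0 {
            cores_count
        } else {
            num_cores_per_instance
        };

        // If there are enough spare cores, hand one more to every instance
        let num_instances = cores_count / per_instance;
        if cores_count - (num_instances * per_instance) >= num_instances {
            per_instance += 1;
        }

        (0..num_instances)
            .map(|ordinal| {
                let start = ordinal * per_instance;
                let end = (ordinal + 1) * per_instance - 1;
                // NB: `start..end` is half-open, so each instance binds
                // `per_instance - 1` cores; `start..=end` would bind all of them.
                start..end
            })
            .collect()
    }

    fn main() {
        // Hypothetical 16-core host, 4 cores requested per instance
        for (i, r) in cores_allocation(16, 4).iter().enumerate() {
            println!("instance {i}: cores {:?}", r.clone().collect::<Vec<_>>());
        }
        // instance 0: [0, 1, 2], instance 1: [4, 5, 6], instance 2: [8, 9, 10], ...
    }

Worth noting: because `a..b` is exclusive, the last core of every slice stays unbound unless the end bound is made inclusive.
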
+ } + } +} + +pub struct LlamaCppBackend { + scheduler_sender: UnboundedSender<(GenerationContext, UnboundedSender)>, + scheduler_handle: JoinHandle<()>, } impl LlamaCppBackend { @@ -93,28 +134,67 @@ impl LlamaCppBackend { tokenizer: Arc, num_cores_per_instance: u16, ) -> Result { - let shared_path = Arc::new(model_path); - let path = shared_path.deref().as_ref(); + let path = model_path.as_ref(); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( path.display().to_string(), )); } - let worker = match num_cores_per_instance { - 0 => { - let worker = Self::allocate_worker(path)?; - let (sender, receiver) = channel(); - let handle = spawn(move || scheduler_loop(worker, tokenizer, receiver)); - LlamaCppBackend::Single(LlamaCppWorker { sender, handle }) - } - _ => panic!("No supported yet"), - }; + let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); + + // Allocate all the workers + let streams = cores_allocation + .iter() + .map(|affinity| match Self::allocate_worker(path) { + Ok(worker) => { + let tokenizer = Arc::clone(&tokenizer); + let (sender, receiver) = channel(); + let affinity = affinity.clone().collect::>(); + spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); - Ok(worker) + Ok(LlamaCppWorker { sender }) + } + Err(e) => Err(e), + }) + .collect::, _>>()?; + + // Start the scheduler loop + let (scheduler_sender, scheduler_receiver) = unbounded_channel(); + let scheduler_handle = tokio::spawn(scheduler_loop(scheduler_receiver, streams)); + Ok(Self { + scheduler_sender, + scheduler_handle, + }) } } +fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { + // Get the total number of cores on the CPU + let cores_count = get_num_cores(); + + // Make sure each instance has some cores available + let mut effective_num_cores_per_instance = match num_cores_per_instance { + 0 => cores_count, + _ => num_cores_per_instance, + }; + + // If we have spare cores, let's see if we can give everyone one more core + let mut num_instances = cores_count / effective_num_cores_per_instance; + if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { + effective_num_cores_per_instance = effective_num_cores_per_instance + 1; + warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); + } + + (0..num_instances) + .map(|ordinal| { + let start = ordinal * effective_num_cores_per_instance; + let end = (ordinal + 1) * effective_num_cores_per_instance - 1; + (start..end) + }) + .collect() +} + fn llama_generate_callback( ctx: *mut InferContext, new_token_id: u32, @@ -164,12 +244,12 @@ fn llama_generate_callback( start: ctx.start, queued: ctx.start, }), - Err(err) => Err(InferError::GenerationError(err.to_string())), + Err(err) => Err(GenerationError(err.to_string())), } } } } - Err(ref err) => Err(InferError::GenerationError(err.to_string())), + Err(ref err) => Err(GenerationError(err.to_string())), }; // Send back to the client @@ -179,14 +259,43 @@ fn llama_generate_callback( status.is_err() } -fn scheduler_loop( +async fn scheduler_loop( + mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender)>, + mut workers: Vec, +) { + // Semaphore allows us to wait for a worker to become available + let permits = Semaphore::new(workers.len()); + + // Let's receive incoming requests + loop { + match queue.recv().await { + None => break, + Some((ctx, sender)) => { + let permit = permits.try_acquire(); + if let Err(err) = permit { + let _ = 
sender.send(Err(InferError::Overloaded(err))); + } + + // We can unwrap because we wouldn't have a semaphore available otherwise + let worker = workers.pop().unwrap(); + worker.submit(ctx, sender); + } + } + } +} + +fn worker_loop( mut backend: UniquePtr, + affinity: Vec, tokenizer: Arc, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism tokenizers::utils::parallelism::set_parallelism(false); + // Bind cores for the current thread + set_numactl_core_affinity(&affinity); + loop { if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); @@ -214,6 +323,7 @@ fn scheduler_loop( llama_generate_callback, ) { error!("Error while decoding tokens... {}", e.what()); + // TODO: What error to give back to the user? } // Make sure we re-keep track of the OpaqueStream box @@ -244,18 +354,15 @@ impl Backend for LlamaCppBackend { sampling_params, }; - match self { - LlamaCppBackend::Single(worker) => match worker.sender.send((ctx, sx)) { - Ok(_) => Ok(UnboundedReceiverStream::new(rx)), - Err(_) => Err(InferError::GenerationError( - "Failed to sent the request".to_string(), - )), - }, + // We send the workload to the scheduler + if let Err(e) = self.scheduler_sender.send((ctx, sx)) { + Err(InferError::IncompleteGenerationStream) + } else { + // We are returning the associated channel as early as we can, potentially closing it up + Ok(UnboundedReceiverStream::new(rx)) } } else { - Err(InferError::GenerationError( - "Unsupported modalities".to_string(), - )) + Err(GenerationError("Unsupported modalities".to_string())) } } diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 8fc989552be..f9fc72e513f 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -51,6 +51,8 @@ mod ffi { fn create_worker_frontend(modelPath: &str) -> Result>; + fn set_numactl_core_affinity(affinity: &[usize]); + unsafe fn stream( self: Pin<&mut LlamaCppWorkerFrontend>, tokens: &[u32], From 50c376612cd49a0a9c16c67b3ee61bd5add96766 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 21 Nov 2024 13:52:38 +0100 Subject: [PATCH 66/91] feat(backend): bind thread and memory affinity for thread --- backends/llamacpp/build.rs | 38 +++++++++----- backends/llamacpp/cmake/numa.cmake | 2 +- backends/llamacpp/csrc/ffi.hpp | 84 +++++++++++++++++++++++++----- backends/llamacpp/src/backend.rs | 7 ++- 4 files changed, 101 insertions(+), 30 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 22726db1843..023ccfbaadb 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -12,8 +12,12 @@ const BACKEND_DEPS: [&str; 2] = [CMAKE_LLAMA_CPP_TARGET, CMAKE_LLAMA_CPP_FFI_TAR macro_rules! 
probe { ($name: expr, $version: expr) => { if let Err(_) = pkg_config::probe_library($name) { - pkg_config::probe_library(&format!("{}-{}", $name, $version)) - .expect(&format!("Failed to locate {}", $name)); + match pkg_config::probe_library(&format!("{}-{}", $name, $version)) { + Ok(_) => Ok(()), + Err(_) => Err(()), + } + } else { + Ok(()) } }; } @@ -53,16 +57,27 @@ fn build_backend( deps_folder } -fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { - println!("cargo:warning={}", deps_folder.display()); +fn build_ffi_layer(is_debug: bool, install_prefix: &Path) { CFG.include_prefix = "backends/llamacpp"; - cxx_build::bridge("src/lib.rs") + + let mut bridge = cxx_build::bridge("src/lib.rs"); + + bridge .static_flag(true) .std("c++23") .include(install_prefix.join("include")) .include("csrc") - .file("csrc/ffi.hpp") - .compile(CMAKE_LLAMA_CPP_FFI_TARGET); // Make sure this target is not the same as cmake above + .file("csrc/ffi.hpp"); + + if is_debug { + bridge.define("TGI_LLAMACPP_BACKEND_DEBUG", ""); + } + + if probe!("numa", "2.0").is_ok() { + bridge.define("NUMA_AVAILABLE", ""); + }; + + bridge.compile(CMAKE_LLAMA_CPP_FFI_TARGET); // Make sure this target is not the same as cmake above } fn main() { @@ -82,11 +97,12 @@ fn main() { let deps_path = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_path, &install_path); + build_ffi_layer(is_debug, &install_path); // Emit linkage search path - probe!("ompi", MPI_REQUIRED_VERSION); - probe!("numa", "2.0"); + if probe!("ompi", MPI_REQUIRED_VERSION).is_err() { + panic!("An implement of MPI is required"); + } // Backend BACKEND_DEPS.iter().for_each(|name| { @@ -97,9 +113,7 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; - let fmt_linkage_target = if is_debug { "fmtd" } else { "fmt" }; println!("cargo:rustc-link-lib=dylib={spdlog_linkage_target}"); - println!("cargo:rustc-link-lib=dylib={fmt_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); diff --git a/backends/llamacpp/cmake/numa.cmake b/backends/llamacpp/cmake/numa.cmake index 0399b752ce9..94dfddc2779 100644 --- a/backends/llamacpp/cmake/numa.cmake +++ b/backends/llamacpp/cmake/numa.cmake @@ -13,8 +13,8 @@ FIND_LIBRARY(NUMA_LIBRARY NAME numa IF (NUMA_INCLUDE_DIR AND NUMA_LIBRARY) SET(NUMA_FOUND TRUE) MESSAGE(STATUS "Found numa library: inc=${NUMA_INCLUDE_DIR}, lib=${NUMA_LIBRARY}") + add_compile_definitions(NUMA_AVAILABLE) ELSE () SET(NUMA_FOUND FALSE) MESSAGE(STATUS "WARNING: Numa library not found.") - MESSAGE(STATUS "Try: 'sudo apt-get install libnuma libnuma-dev' (or sudo yum install numactl numactl-devel)") ENDIF () \ No newline at end of file diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 9700f52e201..147f81aef02 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -17,7 +17,12 @@ #include #include +#ifdef NUMA_AVAILABLE +#define CURRENT_THREAD 0 +#include +#include #include +#endif namespace huggingface::tgi::backends::llamacpp { class llama_cpp_worker_frontend_t; @@ -84,6 +89,10 @@ namespace huggingface::tgi::backends::llamacpp { }; std::unique_ptr create_worker_frontend(rust::Str modelPath) { +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + spdlog::set_level(spdlog::level::debug); +#endif + // Initialize the numa context from numactl static const bool 
INITIALIZED_NUMA_CONTEXT_ONCE = [](){ llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL); @@ -99,21 +108,70 @@ namespace huggingface::tgi::backends::llamacpp { return std::make_unique(model); } - void set_numactl_core_affinity(rust::Slice affinity) { - SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id()); -// auto nodes = std::unordered_set(); - auto cpumask = numa_allocate_cpumask(); - for(auto core : affinity) { - numa_bitmask_setbit(cpumask, core); - numa_sched_setaffinity(0, cpumask); - } + struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; + typedef std::unique_ptr unique_cpumask_ptr; -//#ifdef TGI_LLAMACPP_BACKEND_DEBUG - auto cpumask_check = numa_allocate_cpumask(); - numa_sched_getaffinity(0, cpumask_check); - SPDLOG_DEBUG(FMT_STRING("numa_sched_affinity for thread {} -> {:b}"), std::this_thread::get_id(), *cpumask_check->maskp); -//#endif + void set_numactl_core_affinity(rust::Slice affinity) { +// void set_numactl_core_affinity(std::vector affinity) { +#ifdef NUMA_AVAILABLE + if(numa_available()) { + SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id()); + + auto cpumask = unique_cpumask_ptr(numa_allocate_cpumask()); + std::ranges::for_each(affinity, [&cpumask](size_t cpu) { numa_bitmask_setbit(cpumask.get(), cpu); }); + numa_sched_setaffinity(CURRENT_THREAD, cpumask.get()); + + // Retrieve some information about the current setup + if(const auto numa_num_nodes = numa_num_configured_nodes(); numa_num_nodes > 1) { + const auto *numa_all_cpus = numa_all_cpus_ptr; + SPDLOG_INFO(FMT_STRING("All CPUs: {:b} (# Nodes: {:d}"), *numa_all_cpus->maskp, numa_num_nodes); + + // Retrieve the cpumask specific for the current node + auto cpus_per_node = unique_cpumask_ptr(numa_allocate_cpumask()); + + // Allocate a set which keeps track of which nodes is being targeted + auto numa_spawning_nodes = std::unordered_set(); + for(auto node = 0; node < numa_num_nodes; ++node) { + // Retrieve the cpumask for the target node + numa_node_to_cpus(node, cpus_per_node.get()); + + // intersect which cores on the nodes are targeted, in no one on that specific node + // the value of allocated_cpus_on_node will be 0 as the result of the AND operation. + const auto allocated_cpus_on_node = *cpus_per_node->maskp & *cpumask->maskp; + if(allocated_cpus_on_node > 0) { + + // If we have some cores on the node, attempt to insert in the set of targeted node + if(const auto [_, was_inserted] = numa_spawning_nodes.emplace(node); was_inserted) { + SPDLOG_DEBUG("Allocated thread spawning node: {:d}", node); + } + } + + // Clear all the bits relative to the current node + numa_bitmask_clearall(cpus_per_node.get()); + } + + // Bind the memory if we spawn a single node, otherwise, let's display a warning + if(numa_spawning_nodes.size() == 1) { + SPDLOG_INFO(FMT_STRING("Setting memory affinity to node: {:d}"), *numa_spawning_nodes.begin()); + numa_set_preferred(*numa_spawning_nodes.begin()); + } else { + SPDLOG_WARN(FMT_STRING("Specified thread affinity spawn multiple NUMA nodes: {}"), numa_spawning_nodes); + } + } +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + // Sanity check in the logs... 
+ auto *cpumask_check = numa_allocate_cpumask(); + numa_sched_getaffinity(CURRENT_THREAD, cpumask_check); + SPDLOG_DEBUG( + FMT_STRING("numa_sched_affinity for thread {} -> {:b}"), + std::this_thread::get_id(), *cpumask_check->maskp); + numa_free_cpumask(cpumask_check); +#endif + } +#else + SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support"); +#endif } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index fa5bfbab0e3..1ef959a82c0 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -5,7 +5,6 @@ use crate::ffi::{ use async_trait::async_trait; use cxx::UniquePtr; use log::warn; -use std::cell::RefCell; use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, Sender}; @@ -20,7 +19,7 @@ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; -use tokio::sync::{Semaphore, SemaphorePermit, TryAcquireError}; +use tokio::sync::Semaphore; use tokio::task::JoinHandle; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -180,7 +179,7 @@ fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { }; // If we have spare cores, let's see if we can give everyone one more core - let mut num_instances = cores_count / effective_num_cores_per_instance; + let num_instances = cores_count / effective_num_cores_per_instance; if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { effective_num_cores_per_instance = effective_num_cores_per_instance + 1; warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); @@ -190,7 +189,7 @@ fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { .map(|ordinal| { let start = ordinal * effective_num_cores_per_instance; let end = (ordinal + 1) * effective_num_cores_per_instance - 1; - (start..end) + start..end }) .collect() } From 84eead219af7e4cb5068d87ff4311afbb7b5b55a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 21 Nov 2024 21:43:50 +0100 Subject: [PATCH 67/91] feat(backend): correctly setup llama_context providing n_threads and n_ubatch --- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 2 +- backends/llamacpp/csrc/ffi.hpp | 8 ++++---- backends/llamacpp/src/backend.rs | 27 +++++++++++++++------------ backends/llamacpp/src/lib.rs | 5 ++++- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index a30eb217e95..54f1cf73683 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -39,7 +39,7 @@ namespace huggingface::tgi::backends::llamacpp { return {pSampler, llama_sampler_deleter}; } - worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) + worker_t::worker_t(std::shared_ptr model, const llama_context_params &¶ms) : model_(model), context_(llama_new_context_with_model(model_.get(), params)) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index de37df75eb5..039d4eac9f1 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param model * @param params */ - worker_t(std::shared_ptr, const llama_context_params &); + worker_t(std::shared_ptr, const llama_context_params &&); 
/** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 147f81aef02..f9eec781967 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -51,8 +51,8 @@ namespace huggingface::tgi::backends::llamacpp { worker_t worker_; public: - explicit llama_cpp_worker_frontend_t(llama_model *model): - model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {} + explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): + model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} size_t stream( rust::Slice input_tokens, @@ -88,7 +88,7 @@ namespace huggingface::tgi::backends::llamacpp { } }; - std::unique_ptr create_worker_frontend(rust::Str modelPath) { + std::unique_ptr create_worker_frontend(rust::Str modelPath, uint32_t num_threads) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG spdlog::set_level(spdlog::level::debug); #endif @@ -105,7 +105,7 @@ namespace huggingface::tgi::backends::llamacpp { // Allocate the model from the Rust provided, string path auto *model = (llama_load_model_from_file(static_cast(modelPath).c_str(), params)); - return std::make_unique(model); + return std::make_unique(model, static_cast(num_threads)); } struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 1ef959a82c0..e846a476e16 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -122,8 +122,9 @@ pub struct LlamaCppBackend { impl LlamaCppBackend { fn allocate_worker( path: &Path, + num_threads: u32, ) -> Result, LlamaCppBackendError> { - create_worker_frontend(&path.display().to_string()).map_err(|ref err| { + create_worker_frontend(&path.display().to_string(), num_threads).map_err(|ref err| { LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string()) }) } @@ -145,17 +146,19 @@ impl LlamaCppBackend { // Allocate all the workers let streams = cores_allocation .iter() - .map(|affinity| match Self::allocate_worker(path) { - Ok(worker) => { - let tokenizer = Arc::clone(&tokenizer); - let (sender, receiver) = channel(); - let affinity = affinity.clone().collect::>(); - spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); - - Ok(LlamaCppWorker { sender }) - } - Err(e) => Err(e), - }) + .map( + |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) { + Ok(worker) => { + let tokenizer = Arc::clone(&tokenizer); + let (sender, receiver) = channel(); + let affinity = affinity.clone().collect::>(); + spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); + + Ok(LlamaCppWorker { sender }) + } + Err(e) => Err(e), + }, + ) .collect::, _>>()?; // Start the scheduler loop diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index f9fc72e513f..6b047bf53ff 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -49,7 +49,10 @@ mod ffi { #[cxx_name = "llama_cpp_worker_frontend_t"] type LlamaCppWorkerFrontend; - fn create_worker_frontend(modelPath: &str) -> Result>; + fn create_worker_frontend( + modelPath: &str, + num_threads: u32, + ) -> Result>; fn set_numactl_core_affinity(affinity: &[usize]); From 5a856616610d08ae58c6db78c8fe7d84327b7a19 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 13:32:56 +0100 Subject: [PATCH 68/91] feat(backend): rely on multi consumer queue to scheduler 
workers --- Cargo.lock | 49 ++++++++++++++++++++++++ backends/llamacpp/Cargo.toml | 1 + backends/llamacpp/src/backend.rs | 65 +++++++++++--------------------- 3 files changed, 71 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 81b7c282a7e..4b4e7670ee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,6 +142,18 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "async-channel" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-rustls" version = "0.3.0" @@ -758,6 +770,15 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console" version = "0.15.8" @@ -1158,6 +1179,27 @@ dependencies = [ "cc", ] +[[package]] +name = "event-listener" +version = "5.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "exr" version = "1.72.0" @@ -2922,6 +2964,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.3" @@ -4219,6 +4267,7 @@ dependencies = [ name = "text-generation-backend-llamacpp" version = "2.4.1-dev0" dependencies = [ + "async-channel", "async-trait", "clap 4.5.20", "cmake", diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 0a5039b3034..df2c3421866 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -7,6 +7,7 @@ homepage.workspace = true [dependencies] async-trait = "0.1" +async-channel = "2.3" clap = { version = "4.5.19", features = ["derive"] } cxx = "1.0" num_cpus = "1" diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index e846a476e16..5bcb913b776 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -2,6 +2,7 @@ use crate::ffi::{ create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend, SamplingParams, }; +use async_channel::{unbounded as mpmc_unbounded, Receiver as MpmcReceiver, Sender as MpmcSender}; use async_trait::async_trait; use cxx::UniquePtr; use log::warn; @@ -19,7 +20,6 @@ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; -use tokio::sync::Semaphore; use tokio::task::JoinHandle; use tokio::time::Instant; use 
tokio_stream::wrappers::UnboundedReceiverStream; @@ -102,18 +102,6 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -struct LlamaCppWorker { - sender: Sender<(GenerationContext, UnboundedSender)>, -} - -impl LlamaCppWorker { - fn submit(&self, ctx: GenerationContext, sx: UnboundedSender) { - if let Err(err) = self.sender.send((ctx, sx)) { - // TODO: What do we do? - } - } -} - pub struct LlamaCppBackend { scheduler_sender: UnboundedSender<(GenerationContext, UnboundedSender)>, scheduler_handle: JoinHandle<()>, @@ -141,29 +129,26 @@ impl LlamaCppBackend { )); } - let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); + // Allocate the multi-consumer queue to orchestrate all the workers + let (backlog_submitter, backlog_receiver) = mpmc_unbounded(); // Allocate all the workers - let streams = cores_allocation - .iter() - .map( - |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) { - Ok(worker) => { - let tokenizer = Arc::clone(&tokenizer); - let (sender, receiver) = channel(); - let affinity = affinity.clone().collect::>(); - spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); - - Ok(LlamaCppWorker { sender }) - } - Err(e) => Err(e), - }, - ) - .collect::, _>>()?; + let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); + cores_allocation.iter().for_each(|affinity| { + match Self::allocate_worker(path, num_cores_per_instance as u32) { + Ok(worker) => { + let tokenizer = Arc::clone(&tokenizer); + let affinity = affinity.clone().collect::>(); + let backlog_receiver = backlog_receiver.clone(); + spawn(move || worker_loop(worker, affinity, tokenizer, backlog_receiver)); + } + Err(e) => {} + } + }); // Start the scheduler loop let (scheduler_sender, scheduler_receiver) = unbounded_channel(); - let scheduler_handle = tokio::spawn(scheduler_loop(scheduler_receiver, streams)); + let scheduler_handle = tokio::spawn(scheduler_loop(scheduler_receiver, backlog_submitter)); Ok(Self { scheduler_sender, scheduler_handle, @@ -263,24 +248,16 @@ fn llama_generate_callback( async fn scheduler_loop( mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender)>, - mut workers: Vec, + backlog: MpmcSender<(GenerationContext, UnboundedSender)>, ) { - // Semaphore allows us to wait for a worker to become available - let permits = Semaphore::new(workers.len()); - // Let's receive incoming requests loop { match queue.recv().await { None => break, Some((ctx, sender)) => { - let permit = permits.try_acquire(); - if let Err(err) = permit { - let _ = sender.send(Err(InferError::Overloaded(err))); + if let Err(e) = backlog.send((ctx, sender)).await { + todo!("What do we do") } - - // We can unwrap because we wouldn't have a semaphore available otherwise - let worker = workers.pop().unwrap(); - worker.submit(ctx, sender); } } } @@ -290,7 +267,7 @@ fn worker_loop( mut backend: UniquePtr, affinity: Vec, tokenizer: Arc, - backlog: Receiver<(GenerationContext, UnboundedSender)>, + backlog: MpmcReceiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism tokenizers::utils::parallelism::set_parallelism(false); @@ -299,7 +276,7 @@ fn worker_loop( set_numactl_core_affinity(&affinity); loop { - if let Ok((generation, stream)) = backlog.recv() { + if let Ok((generation, stream)) = backlog.recv_blocking() { let start = Instant::now(); let generation_params = generation.generation_params; // copy let sampling_params = 
generation.sampling_params; // copy From 30ae99631c0c028afe482e460b2bad316918e0f2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 13:34:52 +0100 Subject: [PATCH 69/91] misc(docker): add numa lib as dependency --- Dockerfile.llamacpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 3dab2a2968d..916f885a522 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -23,6 +23,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ cmake \ gcc g++ \ libc++-dev \ + libnumactl-dev \ libopenmpi-dev \ libssl-dev \ ninja-build \ @@ -61,6 +62,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt update && \ apt upgrade -y && \ apt install -y \ + numactl \ openssl \ python3.11-dev From 2d9465d181e0778a5456e5d99503264c98318f65 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 14:02:58 +0100 Subject: [PATCH 70/91] misc(backend): allow rebinding numa core affinity --- backends/llamacpp/csrc/backend.cpp | 1 - backends/llamacpp/csrc/ffi.hpp | 10 +++++++++- backends/llamacpp/src/backend.rs | 21 +++++---------------- backends/llamacpp/src/lib.rs | 3 ++- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 54f1cf73683..b60c3ddc0a3 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -3,7 +3,6 @@ // #include -#include #include #include diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index f9eec781967..d33a4c7b105 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -111,7 +111,7 @@ namespace huggingface::tgi::backends::llamacpp { struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; typedef std::unique_ptr unique_cpumask_ptr; - void set_numactl_core_affinity(rust::Slice affinity) { + void set_numa_core_affinity(rust::Slice affinity) { // void set_numactl_core_affinity(std::vector affinity) { #ifdef NUMA_AVAILABLE if(numa_available()) { @@ -173,6 +173,14 @@ namespace huggingface::tgi::backends::llamacpp { SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support"); #endif } + + /** + * + */ + void update_numa_affinity() { + SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id()); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); + } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 5bcb913b776..709e5d42500 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,6 +1,6 @@ use crate::ffi::{ - create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend, - SamplingParams, + create_worker_frontend, set_numa_core_affinity, update_numa_affinity, GenerationParams, + LlamaCppWorkerFrontend, SamplingParams, }; use async_channel::{unbounded as mpmc_unbounded, Receiver as MpmcReceiver, Sender as MpmcSender}; use async_trait::async_trait; @@ -8,7 +8,6 @@ use cxx::UniquePtr; use log::warn; use std::ops::Range; use std::path::{Path, PathBuf}; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; use std::thread::spawn; use text_generation_router::infer::InferError::GenerationError; @@ -25,17 +24,6 @@ use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; -macro_rules! 
send_or_warn { - ($send: expr, $err: expr) => { - if let Err(se) = $send.send(err) { - warn!( - "Failed to send message back to the user: {}. Originating error: {}", - se, e - ); - } - }; -} - fn get_num_cores() -> usize { match option_env!("TGI_USE_PHYSICAL_CORES") .unwrap_or("OFF") @@ -272,8 +260,9 @@ fn worker_loop( // This loop will mostly decode single token at every step, so no need to rely on parallelism tokenizers::utils::parallelism::set_parallelism(false); - // Bind cores for the current thread - set_numactl_core_affinity(&affinity); + // Bind cores for the current thread and make sure it's taken into account + set_numa_core_affinity(&affinity); + update_numa_affinity(); loop { if let Ok((generation, stream)) = backlog.recv_blocking() { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 6b047bf53ff..e06220f2f84 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -54,7 +54,8 @@ mod ffi { num_threads: u32, ) -> Result>; - fn set_numactl_core_affinity(affinity: &[usize]); + fn set_numa_core_affinity(affinity: &[usize]); + fn update_numa_affinity(); unsafe fn stream( self: Pin<&mut LlamaCppWorkerFrontend>, From 4ee2ee58c9f2b528a95d78129aced91c9ca3e7f3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 14:48:39 +0100 Subject: [PATCH 71/91] misc(license): update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index d6456956733..faa86e9b0a6 100644 --- a/LICENSE +++ b/LICENSE @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2024 Hugging Face Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
From b9c04b9c0726d66c30e8c7108fe306e1193b22c1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 15:13:54 +0100 Subject: [PATCH 72/91] misc(doc): c++ documentation --- backends/llamacpp/csrc/backend.hpp | 29 +++++++++++++-------- backends/llamacpp/csrc/ffi.hpp | 42 +++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 039d4eac9f1..0e1a13ac167 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -33,14 +33,15 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; }; /** - * + * Represent an error which can be returned as part of an std::expected */ enum backend_error_t : uint8_t { + // Provided model filepath doesnt exist MODEL_FILE_DOESNT_EXIST = 1 }; /** - * + * Hold all the parameters provided by TGI to sample from the final distribution of tokens */ struct sampling_params_t { uint32_t top_k = std::numeric_limits::max(); @@ -58,13 +59,19 @@ namespace huggingface::tgi::backends::llamacpp { }; /** - * + * Hold all the parameters provided by TGI to control the generation process */ struct generation_params_t { uint32_t max_new_tokens = std::numeric_limits::max(); bool ignore_eos_token = false; }; + /** + * Container structure wrapping up the current generation context composed by: + * - a non-owning view over the prompt tokens + * - the sampling parameters + * - the generation parameters + */ struct generation_context_t { generation_params_t generation_params; sampling_params_t sampling_params; @@ -72,7 +79,7 @@ namespace huggingface::tgi::backends::llamacpp { }; /** - * + * Represent the actual model execution (i.e. 
"forward") and generation loop for llama.cpp */ class worker_t { private: @@ -81,17 +88,17 @@ namespace huggingface::tgi::backends::llamacpp { public: /** - * - * @param model - * @param params + * Create a new llama.cpp worker from the provided llama_model and the context parameters + * @param model Previously allocated `llama_model` holding the weights of the neural network + * @param params Parameters to allocate the execution context of the model */ worker_t(std::shared_ptr, const llama_context_params &&); /** - * - * @param context - * @param generation_context - * @param callback + * Generate multiple successive tokens, sampled from the distribution generated by executing a forward pass + * over the neural network operations and matrices + * @param generation_context The generation context holding sampling and generation parameters along with prompt tokens + * @param callback An optional callback function which would be called everytime a new token is sampled */ [[nodiscard]] std::expected generate(const generation_context_t &, const std::optional &) const; diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index d33a4c7b105..3645526344f 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -35,11 +35,18 @@ namespace huggingface::tgi::backends::llamacpp { namespace huggingface::tgi::backends::llamacpp { + /** + * Smart pointer to drop a llama_model when going out of scope + */ auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; auto make_shared_llama_model = [](llama_model *model) { return std::shared_ptr(model, llama_model_deleter); }; + /** + * llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and + * allow automatic implementation of Result<_, Exception> from C++ to Rust + */ class llama_cpp_backend_exception_t : std::exception {}; /** @@ -51,9 +58,29 @@ namespace huggingface::tgi::backends::llamacpp { worker_t worker_; public: + /** + * Create a new llama.cpp worker frontend allowing to map custom Rust FFI types from CXX crate to c++ boundary + * @param model The `llama_model` to use on the worker + * @param num_threads The number of threads the worker is allowed to spawn accross for its threadpool + */ explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} + /** + * Generate a new set of tokens from the provided `input_tokens`, streaming each individual token generated + * through the `callback`. + * Individual tokens are generated using the sampling parameters provided through `sampling_params` and the + * generation parameters, provided through `generation_params` allowing to define the behaviour of the generation loop. + * `ctx` is an opaque structure defined on Rust side which holds stream information to send tokens back to the originating client. 
+ * @param input_tokens Prompt input tokens originating from the tokenization of the request's text input + * @param generation_params Parameters controlling the generation loop such as ignoring the end of sentence token or + * the maximum number of tokens to generate + * @param sampling_params Parameters controlling the sampling process on the final token distribution + * @param ctx Opaque structure from Rust holding HTTP channel to stream back response to the client + * @param callback Function pointer called everytime a new token is generated during the generation loop. + * If this callback returns `true` it signals an early termination request on the Rust side. + * @return Number of generated tokens + */ size_t stream( rust::Slice input_tokens, const generation_params_t generation_params, @@ -88,6 +115,12 @@ namespace huggingface::tgi::backends::llamacpp { } }; + /** + * Utility method to allocate a new worker frontend from Rust + * @param modelPath The GGUF model path as an UTF-8 string from Rust + * @param num_threads Integer greater than zero representing the number of threads the worker is allowed to use for computations + * @return unique ownership of `llama_cpp_worker_frontend_t` pointer + */ std::unique_ptr create_worker_frontend(rust::Str modelPath, uint32_t num_threads) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG spdlog::set_level(spdlog::level::debug); @@ -108,9 +141,16 @@ namespace huggingface::tgi::backends::llamacpp { return std::make_unique(model, static_cast(num_threads)); } + /** + * Smart pointer to automatically destroy the underlying numa_bitset * when going out of scope + */ struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; typedef std::unique_ptr unique_cpumask_ptr; + /** + * Define the NUMA core and memory affinity for the current thread by binding cores and memory to respective NUMA node(s) + * @param affinity The set of allowed execution cores to inform the scheduler for the current thread + */ void set_numa_core_affinity(rust::Slice affinity) { // void set_numactl_core_affinity(std::vector affinity) { #ifdef NUMA_AVAILABLE @@ -175,7 +215,7 @@ namespace huggingface::tgi::backends::llamacpp { } /** - * + * Force an update of the llama.cpp/ggml threadpool, reading from NUMA cores affinity */ void update_numa_affinity() { SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id()); From 862a519fdd818fd492696021bb3d19e993ec7b8b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 15:35:55 +0100 Subject: [PATCH 73/91] misc(doc): rust documentation --- backends/llamacpp/src/backend.rs | 135 +++++++++++++++++++++++++------ backends/llamacpp/src/lib.rs | 60 ++++++++++++++ 2 files changed, 169 insertions(+), 26 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 709e5d42500..32547655664 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -24,6 +24,10 @@ use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; +/// Detect the number of CPU cores on the machine +/// +/// returns: usize Integer greater than 0 representing the number of CPU cores on the machine +/// fn get_num_cores() -> usize { match option_env!("TGI_USE_PHYSICAL_CORES") .unwrap_or("OFF") @@ -41,6 +45,45 @@ fn get_num_cores() -> usize { } } +/// Subdivide the set of CPU cores available on the system to equal, non-overlapping, subsets of CPU cores +/// +/// # 
Arguments +/// +/// * `num_cores_per_instance`: Minimum number of cores for each instance +/// +/// returns: Vec, Global> +/// +/// # Examples +/// +/// ``` +/// +/// ``` +fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { + // Get the total number of cores on the CPU + let cores_count = get_num_cores(); + + // Make sure each instance has some cores available + let mut effective_num_cores_per_instance = match num_cores_per_instance { + 0 => cores_count, + _ => num_cores_per_instance, + }; + + // If we have spare cores, let's see if we can give everyone one more core + let num_instances = cores_count / effective_num_cores_per_instance; + if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { + effective_num_cores_per_instance = effective_num_cores_per_instance + 1; + warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); + } + + (0..num_instances) + .map(|ordinal| { + let start = ordinal * effective_num_cores_per_instance; + let end = (ordinal + 1) * effective_num_cores_per_instance - 1; + start..end + }) + .collect() +} + type InferResult = Result; unsafe impl Send for LlamaCppWorkerFrontend {} @@ -96,6 +139,20 @@ pub struct LlamaCppBackend { } impl LlamaCppBackend { + /// Attempt to create a new llama.cpp worker from the provided model path + /// + /// # Arguments + /// + /// * `path`: Path to the GGUF model file to load + /// * `num_threads`: Number of cores the model is allowed to spawn for its computations + /// + /// returns: Result, LlamaCppBackendError> + /// + /// # Examples + /// + /// ``` + /// + /// ``` fn allocate_worker( path: &Path, num_threads: u32, @@ -144,32 +201,27 @@ impl LlamaCppBackend { } } -fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { - // Get the total number of cores on the CPU - let cores_count = get_num_cores(); - - // Make sure each instance has some cores available - let mut effective_num_cores_per_instance = match num_cores_per_instance { - 0 => cores_count, - _ => num_cores_per_instance, - }; - - // If we have spare cores, let's see if we can give everyone one more core - let num_instances = cores_count / effective_num_cores_per_instance; - if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { - effective_num_cores_per_instance = effective_num_cores_per_instance + 1; - warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); - } - - (0..num_instances) - .map(|ordinal| { - let start = ordinal * effective_num_cores_per_instance; - let end = (ordinal + 1) * effective_num_cores_per_instance - 1; - start..end - }) - .collect() -} - +/// llama.cpp worker actual streaming callback, called everytime a new token is being generated +/// +/// # Arguments +/// +/// * `ctx`: InferContext holding the channel to stream back generated token to the client. +/// *UNSAFE* This parameter is unsafe and represented as a mutable pointer to avoid automatic drop of its +/// referenced resources after the first iteration step. +/// It's the responsibility of the caller to ensure a `Box::from_raw` is taking back full ownership of the pointer +/// for correct deletion. 
+/// * `new_token_id`: The sampled token identifier +/// * `new_token_logit`: the sampled token identifier log probability +/// * `is_final`: Flag indicating if the sampled token is a final one +/// * `n_generated_tokens`: Counter representing the actual number of token generated at this stage +/// +/// returns: bool `true` if the worker should stop the generation at this stage, `false` to continue +/// +/// # Examples +/// +/// ``` +/// +/// ``` fn llama_generate_callback( ctx: *mut InferContext, new_token_id: u32, @@ -234,6 +286,20 @@ fn llama_generate_callback( status.is_err() } +/// Main loop allowing scheduling incoming requests without blocking the main execution thread +/// +/// # Arguments +/// +/// * `queue`: Synchronized container to receive new request +/// * `backlog`: Synchronized container to dispatch new request towards all the workers for one to pick it up. +/// +/// returns: () +/// +/// # Examples +/// +/// ``` +/// +/// ``` async fn scheduler_loop( mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender)>, backlog: MpmcSender<(GenerationContext, UnboundedSender)>, @@ -251,6 +317,23 @@ async fn scheduler_loop( } } +/// llama.cpp worker thread receiving incoming requests from the scheduler and handling all generation +/// process along with the streaming logic back to the client. +/// +/// # Arguments +/// +/// * `backend`: Owned llama.cpp worker with allocated execution resources +/// * `affinity`: Set of CPUs to bind the worker's thread for scheduling +/// * `tokenizer`: Tokenizer to use to decode generated token +/// * `backlog`: Multi-consumers queue holding the requests waiting to be handled by a worker +/// +/// returns: () +/// +/// # Examples +/// +/// ``` +/// +/// ``` fn worker_loop( mut backend: UniquePtr, affinity: Vec, diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index e06220f2f84..d844bb9fcd6 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -49,14 +49,74 @@ mod ffi { #[cxx_name = "llama_cpp_worker_frontend_t"] type LlamaCppWorkerFrontend; + /// Create a new llama.cpp worker + /// + /// # Arguments + /// + /// * `modelPath`: Path to the GGUF model file to load + /// * `num_threads`: Number of threads the worker is allowed to spawn to run computations + /// + /// returns: Result<, > + /// + /// # Examples + /// + /// ``` + /// + /// ``` fn create_worker_frontend( modelPath: &str, num_threads: u32, ) -> Result>; + /// Define the NUMA cores affinity on which the current thread is allowed to be scheduled. + /// + /// # Arguments + /// + /// * `affinity`: Set of CPU cores allowed for scheduling + /// + /// returns: () + /// + /// # Examples + /// + /// ``` + /// // Bind the current thread for execution on cores 0, 1, 2, 3 + /// set_numa_core_affinity(&[0, 1, 2, 3]); + /// ``` fn set_numa_core_affinity(affinity: &[usize]); + + /// Force llama.cpp to reevaluate the allowed NUMA context (core and memory affinity) for + /// its internal threads scheduling. + /// This method can potentially cause llama.cpp / ggml to reallocate its internal threadpool to + /// match the new affinity constraints + /// + /// returns: () + /// + /// # Examples + /// + /// ``` + /// set_numa_core_affinity(&[0, 1, 2, 3]); + /// update_numa_affinity(); + /// ``` fn update_numa_affinity(); + /// Generate new tokens from the provided prompt input `tokens` and generation and sampling parameters, + /// streaming back each generated individual token through the `callback`. 
+ /// + /// # Arguments + /// + /// * `tokens`: Prompt input tokenized from the request's text input + /// * `generation_params`: Parameters controling the generation loop + /// * `sampling_params`: Parameters controling the sampling from the token distribution + /// * `stream`: Opaque structure mapping HTTP client transport to stream back token + /// * `callback`: Function pointer called everytime a new token is generated + /// + /// returns: Result> + /// + /// # Examples + /// + /// ``` + /// + /// ``` unsafe fn stream( self: Pin<&mut LlamaCppWorkerFrontend>, tokens: &[u32], From 9025a26ceae2a109cdcf66988309072b1cc58e5b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 15:42:09 +0100 Subject: [PATCH 74/91] chore: remove unrelated change to trtllm --- backends/trtllm/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 80b2b4305af..831372cdf99 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -18,8 +18,6 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) include(ExternalProject) -set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--allow-unsupported-compiler -ccbin=gcc") - option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") From bbe95ca9e9079c5c7cd29dff83fb759d1771c89b Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 28 Nov 2024 09:53:15 +0100 Subject: [PATCH 75/91] Update Dockerfile.llamacpp as per review Co-authored-by: Hugo Larcher --- Dockerfile.llamacpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 916f885a522..44583b0989f 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -64,7 +64,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt install -y \ numactl \ openssl \ - python3.11-dev + python3.11-dev \ + python3.11-venv \ + ibgomp1 COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher COPY --from=builder /usr/src/text-generation-inference/dist /usr/ From d918e6a159ce5d1067fddf6a79b41f48867190fc Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 28 Nov 2024 09:53:59 +0100 Subject: [PATCH 76/91] Update Dockerfile.llamacpp as per review Co-authored-by: Hugo Larcher --- Dockerfile.llamacpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 44583b0989f..92b1882a3a7 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ cmake \ gcc g++ \ libc++-dev \ - libnumactl-dev \ + libnuma-dev \ libopenmpi-dev \ libssl-dev \ ninja-build \ From 274cfce435a5b72806edc429327d6d7f710cb5cf Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 10:59:50 +0100 Subject: [PATCH 77/91] feat(backend): remove core overriding in the Rust backend --- backends/llamacpp/src/backend.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 32547655664..557c14b4921 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -70,10 +70,6 @@ fn get_cores_allocation(num_cores_per_instance: 
usize) -> Vec> { // If we have spare cores, let's see if we can give everyone one more core let num_instances = cores_count / effective_num_cores_per_instance; - if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { - effective_num_cores_per_instance = effective_num_cores_per_instance + 1; - warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); - } (0..num_instances) .map(|ordinal| { From 8e8979351480298277d70ff264a0e82cbe1f34d1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 14:52:48 +0100 Subject: [PATCH 78/91] feat(backend): use the new batch api from llama --- backends/llamacpp/csrc/backend.cpp | 95 ++++++++++++++++++++---------- backends/llamacpp/csrc/backend.hpp | 2 +- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index b60c3ddc0a3..17709b72704 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -38,6 +38,31 @@ namespace huggingface::tgi::backends::llamacpp { return {pSampler, llama_sampler_deleter}; } + + std::expected get_batch_from_prompt(std::span prompt) { + auto batch = llama_batch_init(static_cast(prompt.size()), 0, 1); + std::for_each(prompt.begin(), prompt.end(), [&batch](const llama_token token) { + const auto n_token = batch.n_tokens; + + batch.token[n_token] = token; + batch.pos[n_token] = n_token; + batch.n_seq_id[n_token] = 1; + batch.seq_id[n_token][0] = 1; + batch.logits[n_token] = false; + batch.n_tokens++; + }); + + batch.logits[batch.n_tokens - 1] = true; + return batch; + } + + void update_batch_for_decoding(llama_batch &batch, llama_token token, size_t position) { + batch.n_tokens = 1; + batch.logits[0] = true; + batch.token[0] = token; + batch.pos[0] = static_cast(position); + } + worker_t::worker_t(std::shared_ptr model, const llama_context_params &¶ms) : model_(model), context_(llama_new_context_with_model(model_.get(), params)) { @@ -59,44 +84,50 @@ namespace huggingface::tgi::backends::llamacpp { auto sampler = generation_context.sampling_params.into_llama_sampler(model_.get()); // Set up the prompt - auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); - auto batch = llama_batch_get_one(copy.data(), copy.size()); - - // Decode - auto n_decoded_tokens = 0; - for (bool generating = true; generating; ++n_decoded_tokens) { + if (auto maybe_batch = get_batch_from_prompt(generation_context.input_tokens); maybe_batch.has_value()) { + // Decode + auto batch = *maybe_batch; + auto n_decoded_tokens = 0; + const auto prompt_size = generation_context.input_tokens.size(); + for (bool generating = true; generating; ++n_decoded_tokens) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG - const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(context_.get(), batch); - const auto end = std::chrono::steady_clock::now(); - const auto latency = std::chrono::duration_cast(end - start); - SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(context_.get(), batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(context_.get(), batch); + const auto status = 
llama_decode(context_.get(), batch); #endif - batch.n_tokens = 0; - if (LLAMA_SUCCESS(status)) [[likely]] { - // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); - auto is_eog = llama_token_is_eog(model_.get(), new_token_id); - auto new_token_logits = 0.0f; // TODO: return logit - - // Handle termination cases - const auto has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; - const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; - - generating = !(has_reach_max_tokens | has_reach_eog); + batch.n_tokens = 0; + if (LLAMA_SUCCESS(status)) [[likely]] { + // Sample the new token + auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); + const auto is_eog = llama_token_is_eog(model_.get(), new_token_id); + const auto new_token_logits = llama_get_logits_ith(context_.get(), -1); // TODO: return logit + + // Handle termination cases + const bool has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; + const bool has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; + const bool is_final = has_reach_max_tokens | has_reach_eog; + + // Bubble up the generated token if a callback is provided + const auto should_stop = callback_(new_token_id, *new_token_logits, is_final, n_decoded_tokens + 1); + + // Compute the continuation flag + generating = !(should_stop | is_final); + + // Update the batch for the next generation + update_batch_for_decoding(batch, new_token_id, prompt_size + n_decoded_tokens); + } + } - // Bubble up the generated token if a callback is provided - const auto should_stop = - std::invoke(callback_, new_token_id, new_token_logits, !generating, n_decoded_tokens + 1); - generating ^= should_stop; + llama_batch_free(batch); - batch = llama_batch_get_one(&new_token_id, 1); - } + return n_decoded_tokens; + } else { + return maybe_batch.error(); } - - return n_decoded_tokens; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 0e1a13ac167..321b667ae49 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -75,7 +75,7 @@ namespace huggingface::tgi::backends::llamacpp { struct generation_context_t { generation_params_t generation_params; sampling_params_t sampling_params; - std::span input_tokens; + std::span input_tokens; }; /** From 298367cdfd2f33cc9699b2748f1d8edfbf3c66e1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 14:53:35 +0100 Subject: [PATCH 79/91] feat(backend): fix when num_cores_per_instance is equals to zero with the size of the generated core allocation --- backends/llamacpp/src/backend.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 557c14b4921..e662e207c00 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -176,7 +176,7 @@ impl LlamaCppBackend { // Allocate all the workers let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); cores_allocation.iter().for_each(|affinity| { - match Self::allocate_worker(path, num_cores_per_instance as u32) { + match Self::allocate_worker(path, affinity.len() as u32) { Ok(worker) => { let tokenizer = Arc::clone(&tokenizer); let affinity = affinity.clone().collect::>(); From 929a2fc718a16d1c19cbc44fb0966f0c1c3f5903 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 14:53:46 +0100 Subject: 
[PATCH 80/91] feat(backend): add some test to the backend for core allocation --- backends/llamacpp/build.rs | 2 +- backends/llamacpp/src/backend.rs | 76 ++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 023ccfbaadb..e22fa07c7db 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -94,7 +94,7 @@ fn main() { .unwrap_or(out_dir.join("dist")); // Build the backend - let deps_path = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); + let _ = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); // Build the FFI layer calling the backend above build_ffi_layer(is_debug, &install_path); diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index e662e207c00..d8f28ab9da1 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -28,6 +28,7 @@ use tracing::{debug, error, info}; /// /// returns: usize Integer greater than 0 representing the number of CPU cores on the machine /// +#[cfg(not(test))] fn get_num_cores() -> usize { match option_env!("TGI_USE_PHYSICAL_CORES") .unwrap_or("OFF") @@ -45,6 +46,18 @@ fn get_num_cores() -> usize { } } +#[cfg(test)] +fn get_num_cores() -> usize { + match option_env!("TGI_USE_PHYSICAL_CORES") + .unwrap_or("OFF") + .to_uppercase() + .as_str() + { + "ON" => 16, + _ => 32, + } +} + /// Subdivide the set of CPU cores available on the system to equal, non-overlapping, subsets of CPU cores /// /// # Arguments @@ -417,3 +430,66 @@ impl Backend for LlamaCppBackend { true } } + +#[cfg(test)] +mod tests { + use crate::backend::{get_cores_allocation, get_num_cores}; + + fn test_get_num_cores() { + std::env::set_var("TGI_USE_PHYSICAL_CORES", "OFF"); + assert_eq!(get_num_cores(), 32); + + std::env::set_var("TGI_USE_PHYSICAL_CORES", "ON"); + assert_eq!(get_num_cores(), 16); + } + + fn test_get_cores_allocation_single_instance() { + std::env::set_var("TGI_USE_PHYSICAL_CORES", "OFF"); + let smt_allocation = get_cores_allocation(0); + assert_eq!(smt_allocation.len(), 1); + assert_eq!( + smt_allocation[0].clone().collect::>(), + (0..32).collect::>() + ); + + std::env::set_var("TGI_USE_PHYSICAL_CORES", "ON"); + let smt_allocation = get_cores_allocation(0); + assert_eq!(smt_allocation.len(), 1); + assert_eq!( + smt_allocation[0].clone().collect::>(), + (0..16).collect::>() + ); + } + + fn test_get_cores_allocation_multi_instances() { + for cores_per_instance in [1, 2, 4, 8, 16, 3, 7] { + std::env::set_var("TGI_USE_PHYSICAL_CORES", "OFF"); + + let num_instances = 32 / cores_per_instance; + let smt_allocation = get_cores_allocation(cores_per_instance); + + for i in 0..num_instances { + let start = i * cores_per_instance; + let end = start + cores_per_instance; + assert_eq!( + smt_allocation[i].clone().collect::>(), + (start..end).collect::>() + ); + } + + std::env::set_var("TGI_USE_PHYSICAL_CORES", "ON"); + let num_instances = 16 / cores_per_instance; + let smt_allocation = get_cores_allocation(cores_per_instance); + assert_eq!(smt_allocation.len(), num_instances); + + for i in 0..num_instances { + let start = i * cores_per_instance; + let end = start + cores_per_instance; + assert_eq!( + smt_allocation[i].clone().collect::>(), + (start..end).collect::>() + ); + } + } + } +} From df72c56b5b57adf51fb32a342406c45fda144947 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 16:30:20 +0100 Subject: [PATCH 81/91] feat(backend): add guard in case top_k 
= 0 --- backends/llamacpp/csrc/backend.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 17709b72704..4605243588d 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -28,7 +28,10 @@ namespace huggingface::tgi::backends::llamacpp { false, false )); - llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); + + if (top_k > 0) { + llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); + } if (0 < top_p && top_p < 1) { llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); From 9d659f1e23f0f4eea9c6df8a7b8339d8b8af0af8 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 16:49:29 +0100 Subject: [PATCH 82/91] feat(backend): add missing temperature parameter --- backends/llamacpp/csrc/backend.cpp | 1 + backends/llamacpp/csrc/backend.hpp | 1 + backends/llamacpp/src/backend.rs | 1 + backends/llamacpp/src/lib.rs | 2 ++ 4 files changed, 5 insertions(+) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 4605243588d..00692ea88e8 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -37,6 +37,7 @@ namespace huggingface::tgi::backends::llamacpp { llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); } + llama_sampler_chain_add(pSampler, llama_sampler_init_temp(temperature)); llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); return {pSampler, llama_sampler_deleter}; } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 321b667ae49..38fd3aad676 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -48,6 +48,7 @@ namespace huggingface::tgi::backends::llamacpp { float_t top_p = 1.0f; float_t frequency_penalty = 0.0f; float_t repetition_penalty = 0.0f; + float_t temperature = 0.0f; uint64_t seed = 2014; /** diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index d8f28ab9da1..e1575b1d027 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -104,6 +104,7 @@ impl From<&ValidParameters> for SamplingParams { top_p: v.top_p, frequency_penalty: v.frequency_penalty, repetition_penalty: v.repetition_penalty, + temperature: v.temperature, seed: v.seed, } } diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index d844bb9fcd6..3507217ff86 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -10,6 +10,7 @@ impl Default for SamplingParams { top_p: 1.0f32, frequency_penalty: 0.0f32, repetition_penalty: 0.0f32, + temperature: 1.0f32, seed: 2014u64, } } @@ -29,6 +30,7 @@ mod ffi { top_p: f32, frequency_penalty: f32, repetition_penalty: f32, + temperature: f32, seed: u64, } From 6c5a75b593cc72de9f944c400c572d3178f5f0e3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 17:45:22 +0100 Subject: [PATCH 83/91] misc(offline): update model creation as std::shared_ptr --- backends/llamacpp/offline/main.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 721abf051f5..e5c70e77a4f 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -5,7 +5,7 @@ #include #include -#include s +#include #include "../csrc/backend.hpp" using namespace 
huggingface::tgi::backends::llamacpp; @@ -22,8 +22,9 @@ int main(int argc, char **argv) { const auto modelPath = absolute(std::filesystem::path(argv[1])); const auto params = llama_model_default_params(); - auto model = std::unique_ptr( - llama_load_model_from_file(modelPath.c_str(), params) + auto model = std::shared_ptr( + llama_load_model_from_file(modelPath.c_str(), params), + llama_model_deleter ); auto prompt = "My name is Morgan"; @@ -31,7 +32,7 @@ int main(int argc, char **argv) { const auto nb_tokens = llama_tokenize(model.get(), prompt, sizeof(prompt), tokens.data(), tokens.size(), true, false); tokens.resize(nb_tokens); - auto backend = worker_t{std::move(model), {.n_batch = 1, .n_threads = 4}}; + auto backend = worker_t(std::move(model), {.n_batch = 1, .n_threads = 4}); fmt::println("Tokenized: {}", tokens); From b1ebc8f73bbab6b8d683f58a9b48e150f5af2919 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 23:56:57 +0100 Subject: [PATCH 84/91] feat(backend): update llama.cpp to 4215 --- backends/llamacpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 13107e0abce..05fce9227d2 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -33,7 +33,7 @@ endif () # Download llama.cpp repo at the specific version fetchcontent_declare( llama - URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4077.tar.gz + URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4215.tar.gz ) fetchcontent_makeavailable(llama) From dc6435e3a58decb93d6d06c6d03052be4eb57411 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 23:57:08 +0100 Subject: [PATCH 85/91] feat(backend): create llama_context_params with default factory --- backends/llamacpp/csrc/ffi.hpp | 11 ++++++++++- backends/llamacpp/offline/main.cpp | 25 +++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 3645526344f..99679fdb0cb 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -43,6 +43,15 @@ namespace huggingface::tgi::backends::llamacpp { return std::shared_ptr(model, llama_model_deleter); }; + auto get_llama_context_params = [](size_t num_threads) { + auto params = llama_context_default_params(); + params.n_threads = num_threads; + params.n_threads_batch = num_threads; + params.flash_attn = true; + params.no_perf = false; + return params; + }; + /** * llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and * allow automatic implementation of Result<_, Exception> from C++ to Rust @@ -64,7 +73,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param num_threads The number of threads the worker is allowed to spawn accross for its threadpool */ explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): - model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} + model_{ make_shared_llama_model(model) }, worker_(model_, get_llama_context_params(num_threads)) {} /** * Generate a new set of tokens from the provided `input_tokens`, streaming each individual token generated diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index e5c70e77a4f..fad97b3a1ed 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -27,24 +27,37 
@@ int main(int argc, char **argv) { llama_model_deleter ); - auto prompt = "My name is Morgan"; - auto tokens = std::vector(16); - const auto nb_tokens = llama_tokenize(model.get(), prompt, sizeof(prompt), tokens.data(), tokens.size(), true, + auto prompt = std::string("My name is Morgan"); + auto tokens = std::vector(128); + const auto nb_tokens = llama_tokenize(model.get(), prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), + true, false); tokens.resize(nb_tokens); - auto backend = worker_t(std::move(model), {.n_batch = 1, .n_threads = 4}); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_DISTRIBUTE); + auto backend = worker_t(model, llama_context_default_params()); fmt::println("Tokenized: {}", tokens); // generate auto generated_tokens = std::vector(32); const auto n_generated_tokens = backend.generate( - {{.max_new_tokens = 32}, {.top_k = 40}, tokens}, + {{.max_new_tokens = 32}, {.top_k = 40, .top_p = 0.95, .temperature = 0.8}, + tokens}, [&generated_tokens](llama_token new_token_id, float_t logit, bool is_eos, size_t step) -> bool { generated_tokens.emplace(generated_tokens.begin() + (step - 1), new_token_id); return false; } ); generated_tokens.resize(n_generated_tokens.value()); - fmt::println("Generated {} tokens", generated_tokens); + + std::string decoded = std::string(256, 'a'); + const size_t length = llama_detokenize(model.get(), + generated_tokens.data(), + generated_tokens.size(), + decoded.data(), + decoded.size(), + false, false); + decoded.resize(std::min(length, decoded.size())); + fmt::println("Generated tokens: {}", generated_tokens); + fmt::println("Generated text: {}", decoded); } From b10eaab9f30f7c92ec9d3f73170e69de69c185fa Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 23:57:24 +0100 Subject: [PATCH 86/91] feat(backend): use new batch API to generate tokens --- backends/llamacpp/csrc/backend.cpp | 55 +++++++++++++++--------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 00692ea88e8..f7e4cde288d 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -14,10 +14,10 @@ namespace huggingface::tgi::backends::llamacpp { llama_sampler_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { - auto *pSampler = llama_sampler_chain_init({.no_perf = false}); + auto *sampler = llama_sampler_chain_init({.no_perf = false}); // Penalties - llama_sampler_chain_add(pSampler, llama_sampler_init_penalties( + llama_sampler_chain_add(sampler, llama_sampler_init_penalties( llama_n_vocab(model), llama_token_eos(model), llama_token_nl(model), @@ -28,31 +28,27 @@ namespace huggingface::tgi::backends::llamacpp { false, false )); - - if (top_k > 0) { - llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); - } + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(top_k))); if (0 < top_p && top_p < 1) { - llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(top_p, 0)); } - llama_sampler_chain_add(pSampler, llama_sampler_init_temp(temperature)); - llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); - return {pSampler, llama_sampler_deleter}; + llama_sampler_chain_add(sampler, llama_sampler_init_temp(temperature)); + llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); + return {sampler, llama_sampler_deleter}; } - std::expected 
get_batch_from_prompt(std::span prompt) { auto batch = llama_batch_init(static_cast(prompt.size()), 0, 1); - std::for_each(prompt.begin(), prompt.end(), [&batch](const llama_token token) { - const auto n_token = batch.n_tokens; + batch.n_tokens = 0; - batch.token[n_token] = token; - batch.pos[n_token] = n_token; - batch.n_seq_id[n_token] = 1; - batch.seq_id[n_token][0] = 1; - batch.logits[n_token] = false; + std::for_each(prompt.begin(), prompt.end(), [&batch](const llama_token token) { + batch.token[batch.n_tokens] = token; + batch.pos[batch.n_tokens] = batch.n_tokens; + batch.n_seq_id[batch.n_tokens] = 1; + batch.seq_id[batch.n_tokens][0] = 0; + batch.logits[batch.n_tokens] = false; batch.n_tokens++; }); @@ -60,11 +56,12 @@ namespace huggingface::tgi::backends::llamacpp { return batch; } - void update_batch_for_decoding(llama_batch &batch, llama_token token, size_t position) { - batch.n_tokens = 1; - batch.logits[0] = true; + int32_t update_batch_for_decoding(llama_batch &batch, llama_token token, size_t position) { batch.token[0] = token; batch.pos[0] = static_cast(position); + batch.logits[0] = true; + batch.n_tokens = 1; + return 0; // Decoding will always happen at position 0 } worker_t::worker_t(std::shared_ptr model, const llama_context_params &¶ms) @@ -89,10 +86,14 @@ namespace huggingface::tgi::backends::llamacpp { // Set up the prompt if (auto maybe_batch = get_batch_from_prompt(generation_context.input_tokens); maybe_batch.has_value()) { - // Decode auto batch = *maybe_batch; + + // Keep track of where we are auto n_decoded_tokens = 0; - const auto prompt_size = generation_context.input_tokens.size(); + auto position = batch.n_tokens; + auto sampling_index = batch.n_tokens - 1; + + // Decode for (bool generating = true; generating; ++n_decoded_tokens) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG @@ -104,12 +105,11 @@ namespace huggingface::tgi::backends::llamacpp { #else const auto status = llama_decode(context_.get(), batch); #endif - batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) [[likely]] { // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); + auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), sampling_index); const auto is_eog = llama_token_is_eog(model_.get(), new_token_id); - const auto new_token_logits = llama_get_logits_ith(context_.get(), -1); // TODO: return logit + const auto *new_token_logits = llama_get_logits_ith(context_.get(), sampling_index) + new_token_id; // Handle termination cases const bool has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; @@ -123,7 +123,8 @@ namespace huggingface::tgi::backends::llamacpp { generating = !(should_stop | is_final); // Update the batch for the next generation - update_batch_for_decoding(batch, new_token_id, prompt_size + n_decoded_tokens); + sampling_index = update_batch_for_decoding(batch, new_token_id, position); + position += 1; } } From 59b0ef30189c55e52b20f229d0d39ea74a5bd02d Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Fri, 29 Nov 2024 00:31:36 +0100 Subject: [PATCH 87/91] feat: Fix Cmakelist to allow building on Darwin platform (#2785) * feat: Fix Cmakelist to allow building on Darwin platform * fix: Fix tokenizer in llama.cpp Dockerfile --- Dockerfile.llamacpp | 7 +++++-- backends/llamacpp/CMakeLists.txt | 8 +++++++- backends/llamacpp/README.md | 17 +++++++++++++++++ backends/llamacpp/requirements.txt | 1 + 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 backends/llamacpp/README.md create mode 100644 
backends/llamacpp/requirements.txt diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 92b1882a3a7..e8896ad4915 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -66,11 +66,14 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ openssl \ python3.11-dev \ python3.11-venv \ - ibgomp1 + libgomp1 COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher COPY --from=builder /usr/src/text-generation-inference/dist /usr/ - +COPY --from=builder /usr/src/text-generation-inference/backends/llamacpp/requirements.txt requirements.txt +RUN /usr/bin/python3.11 -m venv /usr/src/text-generation-inference/venv +ENV PATH="/usr/src/text-generation-inference/venv/bin:$PATH" +RUN pip3 install --no-cache-dir -r requirements.txt ENV PORT=8080 WORKDIR /usr/src/text-generation-inference ENTRYPOINT ["text-generation-launcher"] \ No newline at end of file diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 05fce9227d2..6599fd692e9 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.24) project(tgi-llama-cpp-backend VERSION 1.0.0) set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED ON) include(FetchContent) @@ -10,13 +11,18 @@ set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") -if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND (${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")) message(STATUS "Targeting libc++") set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS}) else () message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}") endif () +# add linker options for Darwin +if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L$HOMEBREW_PREFIX/opt/llvm/lib/c++ -L$HOMEBREW_PREFIX/opt/llvm/lib/unwind -lunwind") +endif () + # Add dependencies include(cmake/numa.cmake) include(cmake/spdlog.cmake) diff --git a/backends/llamacpp/README.md b/backends/llamacpp/README.md new file mode 100644 index 00000000000..0931339c40d --- /dev/null +++ b/backends/llamacpp/README.md @@ -0,0 +1,17 @@ +## Compiling with MacOS + +To compile the Llama.cpp backend on MacOS, you need to install `clang` and `cmake` via Homebrew: + +```bash +brew install llvm cmake +``` + +You then need to configure CMakelists.txt to use the newly installed clang compiler. +You can do this by configuring your IDE or adding the following lines to the top of the file: + +```cmake +set(CMAKE_C_COMPILER /opt/homebrew/opt/llvm/bin/clang) +set(CMAKE_CXX_COMPILER /opt/homebrew/opt/llvm/bin/clang++) +``` + +CMakelist.txt assumes that Homebrew installs libc++ in `$HOMEBREW_PREFIX/opt/llvm/lib/c++`. 
\ No newline at end of file diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt new file mode 100644 index 00000000000..2372d58ba53 --- /dev/null +++ b/backends/llamacpp/requirements.txt @@ -0,0 +1 @@ +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" \ No newline at end of file From f5c4cee364ca61f74103f9cb8aec992b670eb7e5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 16:22:43 +0100 Subject: [PATCH 88/91] feat(backend): correctly link to all libraries --- backends/llamacpp/build.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index e22fa07c7db..b5fd7bc0463 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -115,6 +115,9 @@ fn main() { let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; println!("cargo:rustc-link-lib=dylib={spdlog_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); + println!("cargo:rustc-link-lib=dylib=ggml-base"); + println!("cargo:rustc-link-lib=dylib=ggml-cpu"); + println!("cargo:rustc-link-lib=dylib=ggml-amx"); println!("cargo:rustc-link-lib=dylib=llama"); // Rerun if one of these file change From db41776a0e151f0a57170d4b614d5c4c5ef6ca27 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 16:22:55 +0100 Subject: [PATCH 89/91] feat(backend): add mimalloc memory allocator to the container --- Dockerfile.llamacpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index e8896ad4915..78b3636bcf4 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -54,6 +54,26 @@ ENV RUSTFLAGS="-L/usr/lib" ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen +FROM ubuntu:22.04 AS mimalloc-builder +ENV DEBIAN_FRONTEND=noninteractive +ENV MIMALLOC_VERSION=2.1.7 +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && \ + apt upgrade -y && \ + apt install -y \ + clang \ + cmake \ + ninja-build \ + wget + +RUN wget https://github.com/microsoft/mimalloc/archive/refs/tags/v${MIMALLOC_VERSION}.tar.gz -O mimalloc-${MIMALLOC_VERSION}.tar.gz && \ + tar -xzf mimalloc-${MIMALLOC_VERSION}.tar.gz && \ + cd mimalloc-${MIMALLOC_VERSION} && \ + cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -B build . 
&& \ + cmake --build build --parallel && \ + cmake --install build + FROM ubuntu:22.04 ENV DEBIAN_FRONTEND=noninteractive @@ -62,18 +82,20 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt update && \ apt upgrade -y && \ apt install -y \ + libopenmpi3 \ numactl \ openssl \ python3.11-dev \ - python3.11-venv \ - libgomp1 + python3.11-venv COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher COPY --from=builder /usr/src/text-generation-inference/dist /usr/ COPY --from=builder /usr/src/text-generation-inference/backends/llamacpp/requirements.txt requirements.txt +COPY --from=mimalloc-builder /usr/local/lib/libmimalloc.so.2.1 /usr/lib/libmimalloc.so.2.1 + RUN /usr/bin/python3.11 -m venv /usr/src/text-generation-inference/venv ENV PATH="/usr/src/text-generation-inference/venv/bin:$PATH" RUN pip3 install --no-cache-dir -r requirements.txt ENV PORT=8080 WORKDIR /usr/src/text-generation-inference -ENTRYPOINT ["text-generation-launcher"] \ No newline at end of file +ENTRYPOINT ["LD_PRELOAD=/usr/lib/libmimalloc.so.2.1", "text-generation-launcher"] \ No newline at end of file From c9f6c3a8f79d12346372ba786db9be9cd010a40b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 23:34:16 +0100 Subject: [PATCH 90/91] feat(backend): better map exception throw on C++ side --- backends/llamacpp/csrc/backend.cpp | 6 ++++++ backends/llamacpp/csrc/backend.hpp | 4 +++- backends/llamacpp/csrc/ffi.hpp | 33 +++++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f7e4cde288d..b6b3de00477 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -125,6 +125,12 @@ namespace huggingface::tgi::backends::llamacpp { // Update the batch for the next generation sampling_index = update_batch_for_decoding(batch, new_token_id, position); position += 1; + } else { + if (status == 1) { + return backend_error_t::NO_KV_SLOT_AVAILABLE; + } else { + return backend_error_t::DECODING_ERROR; + } } } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 38fd3aad676..e1ab1e6504f 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -37,7 +37,9 @@ namespace huggingface::tgi::backends::llamacpp { */ enum backend_error_t : uint8_t { // Provided model filepath doesnt exist - MODEL_FILE_DOESNT_EXIST = 1 + MODEL_FILE_DOESNT_EXIST = 1, + NO_KV_SLOT_AVAILABLE = 2, + DECODING_ERROR = 3 }; /** diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 99679fdb0cb..2f1437397ca 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -32,7 +32,6 @@ namespace huggingface::tgi::backends::llamacpp { #include "backends/llamacpp/src/lib.rs.h" #include "rust/cxx.h" - namespace huggingface::tgi::backends::llamacpp { /** @@ -56,7 +55,12 @@ namespace huggingface::tgi::backends::llamacpp { * llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and * allow automatic implementation of Result<_, Exception> from C++ to Rust */ - class llama_cpp_backend_exception_t : std::exception {}; + class llama_cpp_backend_exception_t : std::exception { + public: + backend_error_t error; + + llama_cpp_backend_exception_t(const backend_error_t error): error(error) {}; + }; /** * Llama.cpp frontend over the worker 
interfacing with Rust FFI layer
@@ -119,7 +123,7 @@ namespace huggingface::tgi::backends::llamacpp {
         if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] {
             return *result;
         } else {
-            throw llama_cpp_backend_exception_t {};
+            throw llama_cpp_backend_exception_t(result.error());
         }
     }
 };
@@ -232,5 +236,28 @@ namespace huggingface::tgi::backends::llamacpp {
     }
 }

+// Error handler converting C++ exceptions to a Rust Result
+template <typename Try, typename Fail>
+static void trycatch(Try &&func, Fail &&fail) noexcept try {
+    func();
+} catch (const huggingface::tgi::backends::llamacpp::llama_cpp_backend_exception_t &e) {
+    switch (e.error) {
+        case huggingface::tgi::backends::llamacpp::backend_error_t::MODEL_FILE_DOESNT_EXIST: {
+            fail("Specified model path doesn't exist.");
+            break;
+        }
+        case huggingface::tgi::backends::llamacpp::backend_error_t::NO_KV_SLOT_AVAILABLE: {
+            fail("Keys/Values cache is full, no slot available for the new batch.");
+            break;
+        }
+        case huggingface::tgi::backends::llamacpp::backend_error_t::DECODING_ERROR: {
+            fail("An error was detected during generation.");
+            break;
+        }
+    }
+    fail();
+}
+
+
 #endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP

From e0dda9b614d285f3ee9e4053f9306c946a753721 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Fri, 29 Nov 2024 23:38:27 +0100
Subject: [PATCH 91/91] feat(backend): use c++ defined types for llama.cpp

---
 backends/llamacpp/csrc/backend.cpp | 2 +-
 backends/llamacpp/csrc/backend.hpp | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
index b6b3de00477..d3f89adca61 100644
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -36,7 +36,7 @@ namespace huggingface::tgi::backends::llamacpp {
         llama_sampler_chain_add(sampler, llama_sampler_init_temp(temperature));
         llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed));
-        return {sampler, llama_sampler_deleter};
+        return llama_sampler_ptr(sampler);
     }
     std::expected<llama_batch, backend_error_t> get_batch_from_prompt(std::span<const llama_token> prompt) {
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
index e1ab1e6504f..84602e77d08 100644
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -17,18 +17,12 @@
 #include
 #include
+#include
 #include
 #define LLAMA_SUCCESS(x) x == 0
 namespace huggingface::tgi::backends::llamacpp {
-
-    static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
-    typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;
-
-    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
-    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;
-
     typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
     static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };
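
The generation loop added in PATCH 86 drives llama.cpp through a single reusable `llama_batch`: the prompt is submitted once, then the batch is shrunk to one token whose position advances at every step, and sampling always reads the logits at the index that was just decoded. Below is a minimal, self-contained sketch of that pattern, assuming a `llama_model *`, `llama_context *` and `llama_sampler *` created elsewhere; the function name `generate_tokens_sketch` and the simplified error handling are illustrative only, not part of the backend.

```cpp
// Minimal sketch: single-sequence decoding with one reusable llama_batch.
// Assumes model/ctx/sampler were created elsewhere (e.g. llama_load_model_from_file,
// llama_new_context_with_model, llama_sampler_chain_init + llama_sampler_chain_add).
#include <cstddef>
#include <cstdint>
#include <vector>
#include <llama.h>

std::vector<llama_token> generate_tokens_sketch(llama_model *model,
                                                llama_context *ctx,
                                                llama_sampler *sampler,
                                                const std::vector<llama_token> &prompt,
                                                size_t max_new_tokens) {
    std::vector<llama_token> generated;
    if (prompt.empty()) return generated;

    // Submit the whole prompt as sequence 0; only the last token requests logits.
    llama_batch batch = llama_batch_init(static_cast<int32_t>(prompt.size()), 0, 1);
    for (size_t i = 0; i < prompt.size(); ++i) {
        batch.token[i] = prompt[i];
        batch.pos[i] = static_cast<llama_pos>(i);
        batch.n_seq_id[i] = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i] = false;
    }
    batch.n_tokens = static_cast<int32_t>(prompt.size());
    batch.logits[batch.n_tokens - 1] = true;

    int32_t sampling_index = batch.n_tokens - 1; // where the fresh logits live
    llama_pos position = batch.n_tokens;         // next position in the sequence

    for (size_t step = 0; step < max_new_tokens; ++step) {
        if (llama_decode(ctx, batch) != 0)
            break; // e.g. no KV-cache slot available or decoding error

        const llama_token next = llama_sampler_sample(sampler, ctx, sampling_index);
        generated.push_back(next);
        if (llama_token_is_eog(model, next))
            break;

        // Reuse the batch as a single-token batch for the next step.
        batch.token[0] = next;
        batch.pos[0] = position++;
        batch.n_seq_id[0] = 1;
        batch.seq_id[0][0] = 0;
        batch.logits[0] = true;
        batch.n_tokens = 1;
        sampling_index = 0; // logits are now produced at index 0
    }

    llama_batch_free(batch);
    return generated;
}
```

Reusing one batch and only updating `token[0]`, `pos[0]`, `logits[0]` and `n_tokens` avoids reallocating batch buffers on every decoding step, which is the trade-off `update_batch_for_decoding` encodes in the patches above.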