Commit 424acd6
feat(llama.cpp): allow to set cache-ram and ctx_shift (#7009)
* feat(llama.cpp): allow to set cache-ram and ctx_shift

Signed-off-by: Ettore Di Giacinto <[email protected]>

* Apply suggestion from @mudler

Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 3cd8234 commit 424acd6
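
In practice, the two new settings travel through the backend's generic options list as `optname:optval` strings, so a model definition would pass something like `context_shift:true` and `cache_ram:2048` (values illustrative). Per the added comments in the diff below, `cache_ram` is interpreted as MiB, with `-1` meaning no limit, and context shifting is off unless explicitly enabled.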

File tree

1 file changed (+19 -4 lines)

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 19 additions & 4 deletions
@@ -270,6 +270,11 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
         add_rpc_devices(std::string(llama_grpc_servers));
     }
 
+    // Initialize ctx_shift to false by default (can be overridden by options)
+    params.ctx_shift = false;
+    // Initialize cache_ram_mib to -1 by default (no limit, can be overridden by options)
+    params.cache_ram_mib = -1;
+
     // decode options. Options are in form optname:optvale, or if booleans only optname.
     for (int i = 0; i < request->options_size(); i++) {
         std::string opt = request->options(i);
@@ -279,8 +284,20 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
             optval = "true";
         }
 
-        if (!strcmp(optname, "gpu")) {
-            // llama.has_gpu = true;
+        if (!strcmp(optname, "context_shift")) {
+            if (!strcmp(optval, "true") || !strcmp(optval, "1") || !strcmp(optval, "yes") || !strcmp(optval, "on") || !strcmp(optval, "enabled")) {
+                params.ctx_shift = true;
+            } else if (!strcmp(optval, "false") || !strcmp(optval, "0") || !strcmp(optval, "no") || !strcmp(optval, "off") || !strcmp(optval, "disabled")) {
+                params.ctx_shift = false;
+            }
+        } else if (!strcmp(optname, "cache_ram")) {
+            if (optval != NULL) {
+                try {
+                    params.cache_ram_mib = std::stoi(optval);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (-1)
+                }
+            }
         }
     }
 
@@ -342,8 +359,6 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
     }
 
     params.no_kv_offload = request->nokvoffload();
-    params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
-
     params.embedding = request->embeddings() || request->reranking();
     if (request->reranking()) {
         params.pooling_type = LLAMA_POOLING_TYPE_RANK;
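
To make the new parsing concrete, below is a minimal, self-contained sketch of the behavior the hunks above introduce. `ServerParams` and `parse_options` are hypothetical stand-ins for the real llama.cpp params struct and the gRPC request loop (which compares raw C strings rather than `std::string`); this is an illustration under those assumptions, not the implementation.

#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in modeling only the two fields this commit touches.
struct ServerParams {
    bool ctx_shift     = false; // context shifting disabled by default
    int  cache_ram_mib = -1;    // -1 means "no limit"
};

// Mirrors the grpc-server.cpp loop: options arrive as "optname:optval",
// and a bare "optname" is treated as the boolean value "true".
static void parse_options(ServerParams& params, const std::vector<std::string>& options) {
    for (const std::string& opt : options) {
        const std::string::size_type sep = opt.find(':');
        const std::string optname = (sep == std::string::npos) ? opt : opt.substr(0, sep);
        const std::string optval  = (sep == std::string::npos) ? "true" : opt.substr(sep + 1);

        if (optname == "context_shift") {
            if (optval == "true" || optval == "1" || optval == "yes" ||
                optval == "on" || optval == "enabled") {
                params.ctx_shift = true;
            } else if (optval == "false" || optval == "0" || optval == "no" ||
                       optval == "off" || optval == "disabled") {
                params.ctx_shift = false;
            }
        } else if (optname == "cache_ram") {
            try {
                params.cache_ram_mib = std::stoi(optval); // value in MiB
            } catch (const std::exception&) {
                // Malformed number: keep the -1 default, as the commit does.
            }
        }
    }
}

int main() {
    ServerParams params;
    parse_options(params, {"context_shift:true", "cache_ram:2048"});
    std::cout << "ctx_shift=" << params.ctx_shift
              << " cache_ram_mib=" << params.cache_ram_mib << '\n';
    // Prints: ctx_shift=1 cache_ram_mib=2048
    return 0;
}

Note the design choice preserved from the diff: an unparseable `cache_ram` value falls back silently to the -1 default rather than failing model load.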
