
Commit 04d6e93

Introduce --gpu and --tinyblas flags
1 parent 8762f13 commit 04d6e93

14 files changed: +267 −68 lines

llama.cpp/common.cpp

Lines changed: 12 additions & 2 deletions
@@ -523,6 +523,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.unsecure = true;
         } else if (arg == "--nocompile") {
             FLAG_nocompile = true;
+        } else if (arg == "--tinyblas") {
+            FLAG_tinyblas = true;  // undocumented
+        } else if (arg == "--gpu") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            FLAG_gpu = llamafile_gpu_parse(argv[i]);
+            if (FLAG_gpu == -1) {
+                fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
+                exit(1);
+            }
         } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
             params.dump_kv_cache = true;
         } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
@@ -930,8 +942,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
     printf("  -ld LOGDIR, --logdir LOGDIR\n");
     printf("                        path under which to save YAML logs (no logging if unset)\n");
-    printf("  --unsecure            disables pledge() sandboxing on Linux and OpenBSD\n");
-    printf("  --nocompile           disables runtime compilation of gpu support\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");

llama.cpp/ggml.c

Lines changed: 2 additions & 2 deletions
@@ -14168,7 +14168,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }
 
-    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
         bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
         if (skip_cpu) {
             return;
@@ -16048,7 +16048,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                     //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
 
-                    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+                    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
                         if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
                             n_tasks = 1; // TODO: this actually is doing nothing
                                          //       the threads are still spinning

llama.cpp/llama.cpp

Lines changed: 28 additions & 26 deletions
@@ -734,25 +734,27 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 //
 
 inline void * llama_host_malloc(size_t n) {
-    if (ggml_metal_supported()) {
-        return ggml_metal_host_malloc(n);
-    } else if (ggml_cublas_loaded()) {
-        return ggml_cuda_host_malloc(n);
-    } else {
-        return malloc(n);
+    switch (llamafile_gpu_supported()) {
+        case LLAMAFILE_GPU_APPLE:
+            return ggml_metal_host_malloc(n);
+        case LLAMAFILE_GPU_NVIDIA:
+            return ggml_cuda_host_malloc(n);
+        default:
+            return malloc(n);
     }
 #if GGML_USE_CPU_HBM
 #error fix me
 #endif
 }
 
 inline void llama_host_free(void * ptr) {
-    if (ggml_metal_supported()) {
-        return ggml_metal_host_free(ptr);
-    } else if (ggml_cublas_loaded()) {
-        return ggml_cuda_host_free(ptr);
-    } else {
-        return free(ptr);
+    switch (llamafile_gpu_supported()) {
+        case LLAMAFILE_GPU_APPLE:
+            return ggml_metal_host_free(ptr);
+        case LLAMAFILE_GPU_NVIDIA:
+            return ggml_cuda_host_free(ptr);
+        default:
+            return free(ptr);
     }
 #if GGML_USE_CPU_HBM
 #error fix me
@@ -895,7 +897,7 @@ struct llama_mmap {
 
         // report terminal progress of loading weights off the disk into
        // the cpu. if we're using gpu inference, then don't even bother
-        if (!ggml_metal_supported() && !ggml_cublas_loaded()) {
+        if (!llamafile_gpu_supported()) {
            llamafile_schlep(addr, size);
        }
    }
@@ -1276,7 +1278,7 @@ struct llama_kv_cache {
            ggml_free(ctx);
        }
 
-        if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+        if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
            for (size_t i = 0; i < k_l.size(); ++i) {
                ggml_cuda_free_data(k_l[i]);
                ggml_cuda_free_data(v_l[i]);
@@ -1387,7 +1389,7 @@ struct llama_model {
            ggml_free(ctx);
        }
 
-        if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+        if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
                ggml_cuda_free_data(tensors_by_name[i].second);
            }
@@ -1515,7 +1517,7 @@ static bool llama_kv_cache_init(
        ggml_format_name(v, "cache_v_l%d", i);
        cache.k_l.push_back(k);
        cache.v_l.push_back(v);
-        if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+        if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
            if (i >= i_gpu_start) {
                if (offload) {
                    ggml_cuda_assign_buffers_no_scratch(k);
@@ -2923,7 +2925,7 @@ static void llm_load_tensors(
    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
 
-    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
        ggml_cuda_set_main_device(main_gpu);
 
@@ -3645,7 +3647,7 @@ static void llm_load_tensors(
 
    LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
-    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3668,7 +3670,7 @@ static void llm_load_tensors(
    }
 
    (void) tensor_split;
-    if (!ggml_metal_supported() && ggml_cublas_loaded())
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA)
    {
        ggml_cuda_set_tensor_split(tensor_split);
    }
@@ -5975,7 +5977,7 @@ static struct ggml_cgraph * llama_build_graph(
 
    // this is needed for compatibility with Metal for example
    static offload_func_t ggml_offload_gpu;
-    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
        ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
    } else {
        ggml_offload_gpu = ggml_offload_nop;
@@ -6197,7 +6199,7 @@ static int llama_decode_internal(
        GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
    }
 
-    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
        for (int i = 0; i < gf->n_leafs; i++) {
            ggml_tensor * node = gf->leafs[i];
            if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
@@ -8916,7 +8918,7 @@ static int llama_apply_lora_from_file_internal(
    offload_func_t offload_func = ggml_offload_nop;
    offload_func_t offload_func_force_inplace = ggml_offload_nop;
 
-    if (!ggml_metal_supported() && ggml_cublas_loaded()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
        if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
            if (dest_t->type != GGML_TYPE_F16) {
                ThrowRuntimeError(format(
@@ -9042,7 +9044,7 @@ struct llama_model_params llama_model_default_params() {
        /*.use_mlock =*/ false,
    };
 
-    if (ggml_metal_supported()) {
+    if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
        result.n_gpu_layers = 1;
    }
 
@@ -9277,7 +9279,7 @@ struct llama_context * llama_new_context_with_model(
            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
            ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
 
-            if (ggml_metal_supported()) {
+            if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
            if (model->n_gpu_layers > 0) {
                ctx->ctx_metal = ggml_metal_init(1);
                if (!ctx->ctx_metal) {
@@ -9303,7 +9305,7 @@ struct llama_context * llama_new_context_with_model(
            if (ctx->ctx_metal) {
                //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
            }
-            if (!ggml_metal_supported() && ggml_cuda_supported()) {
+            if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
            ggml_cuda_set_scratch_size(alloc_size);
            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
@@ -9336,7 +9338,7 @@ struct llama_context * llama_new_context_with_model(
                }
            }
 
-            if (ggml_metal_supported()) {
+            if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
            if (model->n_gpu_layers > 0) {
                // this allocates all Metal resources and memory buffers
 
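
The change running through these hunks is that one query, llamafile_gpu_supported(), replaces the old pair of ggml_metal_supported() / ggml_cublas_loaded() checks. The diff implies it returns a falsy value when no GPU backend is active and a specific LLAMAFILE_GPU_* constant otherwise. The sketch below only restates that assumed contract; constant names other than APPLE and NVIDIA are placeholders, not taken from the commit.

    #include <stdio.h>

    enum { GPU_NONE = 0, GPU_APPLE, GPU_NVIDIA };  /* assumed: 0 means CPU only */

    /* Mirrors how the call sites in llama.cpp branch on the result. */
    static void configure_backend(int backend) {
        if (!backend) {
            puts("cpu only: report mmap progress, keep pledge() sandboxing");
        } else if (backend == GPU_NVIDIA) {
            puts("cuda: pinned host buffers, scratch size, tensor split, offload");
        } else if (backend == GPU_APPLE) {
            puts("metal: n_gpu_layers defaults to 1, init ggml_metal context");
        }
    }

    int main(void) {
        configure_backend(GPU_NONE);
        configure_backend(GPU_NVIDIA);
        configure_backend(GPU_APPLE);
        return 0;
    }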

llama.cpp/main/main.1

Lines changed: 38 additions & 0 deletions
@@ -353,6 +353,44 @@ Force system to keep model in RAM rather than swapping or compressing.
 Do not memory-map model (slower load but may reduce pageouts if not using mlock).
 .It Fl Fl numa
 Attempt optimizations that help on some NUMA systems if run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437.
+.It Fl Fl nocompile
+Never compile GPU support at runtime.
+.Pp
+If
+.Pa ~/.llamafile/ggml-cuda.dll
+already exists on the file system (or .so for UNIX and .dylib for
+MacOS), then it'll be linked as-is without question. Otherwise,
+.Nm
+will fall back to CPU inference.
+.It Fl Fl gpu Ar GPU
+Specifies which brand of GPU should be used. Valid choices are:
+.Pp
+.Bl -dash
+.It
+.Ar AUTO :
+Use any GPU if possible, otherwise fall back to CPU inference (default)
+.It
+.Ar AMD :
+Use AMD GPU. The AMD ROCm SDK must be installed and the HIP_PATH
+environment variable must be defined. If an AMD GPU could not be used
+for any reason, then a fatal error will be raised.
+.It
+.Ar APPLE :
+Use Apple Metal GPU. This is only available on MacOS ARM64. If Metal
+could not be used for any reason, then a fatal error will be raised.
+.It
+.Ar NVIDIA :
+Use NVIDIA GPU. If an NVIDIA GPU could not be used for any reason, a
+fatal error will be raised. On Windows, NVIDIA GPU support will use our
+tinyBLAS library, since it works on stock Windows installs. If both MSVC
+and CUDA are installed beforehand, and
+.Nm
+is run for the first time on the x64 command prompt, then llamafile will
+use NVIDIA's faster cuBLAS library instead. On Linux and other systems,
+the CUDA SDK must always be installed, so that native GPU support can be
+compiled on the fly.
+.El
+.Pp
 .It Fl ngl Ar N , Fl Fl n-gpu-layers Ar N
 Number of layers to store in VRAM.
 .It Fl ngld Ar N , Fl Fl n-gpu-layers-draft Ar N
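
Combined with the parsing added in common.cpp and server.cpp, the documented values are passed straight on the command line; for example (the model filename below is a placeholder, not part of this commit):

    llamafile -m model.gguf --gpu NVIDIA    # require an NVIDIA GPU, fail otherwise
    llamafile -m model.gguf --gpu AUTO      # default: use any GPU, else CPU inference
    llamafile -m model.gguf --nocompile     # never compile GPU support at runtime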

llama.cpp/main/main.1.asc

Lines changed: 33 additions & 0 deletions
@@ -324,6 +324,39 @@ OOPPTTIIOONNSS
             page cache before using this. See
             https://github.com/ggerganov/llama.cpp/issues/1437.
 
+     ----nnooccoommppiillee
+             Never compile GPU support at runtime.
+
+             If _~_/_._l_l_a_m_a_f_i_l_e_/_g_g_m_l_-_c_u_d_a_._d_l_l already exists on the file system
+             (or .so for UNIX and .dylib for MacOS), then it'll be linked as-
+             is without question. Otherwise, llllaammaaffiillee will fall back to CPU
+             inference.
+
+     ----ggppuu _G_P_U
+             Specifies which brand of GPU should be used. Valid choices are:
+
+             --  _A_U_T_O: Use any GPU if possible, otherwise fall back to CPU
+                 inference (default)
+
+             --  _A_M_D: Use AMD GPU. The AMD ROCm SDK must be installed and the
+                 HIP_PATH environment variable must be defined. If an AMD GPU
+                 could not be used for any reason, then a fatal error will be
+                 raised.
+
+             --  _A_P_P_L_E: Use Apple Metal GPU. This is only available on MacOS
+                 ARM64. If Metal could not be used for any reason, then a
+                 fatal error will be raised.
+
+             --  _N_V_I_D_I_A: Use NVIDIA GPU. If an NVIDIA GPU could not be used
+                 for any reason, a fatal error will be raised. On Windows,
+                 NVIDIA GPU support will use our tinyBLAS library, since it
+                 works on stock Windows installs. If both MSVC and CUDA are
+                 installed beforehand, and llllaammaaffiillee is run for the first time
+                 on the x64 command prompt, then llamafile will use NVIDIA's
+                 faster cuBLAS library instead. On Linux and other systems,
+                 the CUDA SDK must always be installed, so that native GPU
+                 support can be compiled on the fly.
+
      --nnggll _N, ----nn--ggppuu--llaayyeerrss _N
              Number of layers to store in VRAM.
 

llama.cpp/main/main.cpp

Lines changed: 2 additions & 1 deletion
@@ -121,6 +121,7 @@ int main(int argc, char ** argv) {
 
     if (has_argument(argc, argv, "--help")) {
         llamafile_help("/zip/llama.cpp/main/main.1.asc");
+        __builtin_unreachable();
     }
 
     if (!has_argument(argc, argv, "--cli") &&
@@ -162,7 +163,7 @@ int main(int argc, char ** argv) {
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
-    if (!params.unsecure && !ggml_metal_supported() && !ggml_cuda_supported()) {
+    if (!params.unsecure && !llamafile_gpu_supported()) {
         // Enable pledge() security on Linux and OpenBSD.
         // - We do this *after* opening the log file for writing.
         // - We do this *before* loading any weights or graphdefs.
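
The __builtin_unreachable() added after llamafile_help() is the standard GCC/Clang hint that control never reaches that point, which presumably reflects that the help routine pages the manual and exits instead of returning. A generic sketch of the pattern, with made-up helper names:

    #include <stdlib.h>

    /* Stand-in for a routine that never returns, e.g. one that prints
       help text and terminates the process. */
    static void print_help_and_exit(void) {
        exit(0);
    }

    int handle_args(int wants_help) {
        if (wants_help) {
            print_help_and_exit();
            __builtin_unreachable();  /* optimizer may assume this path ends above */
        }
        return 0;
    }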

llama.cpp/server/server.cpp

Lines changed: 15 additions & 4 deletions
@@ -1979,9 +1979,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("                            Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("    --mmproj MMPROJ_FILE    path to a multimodal projector file for LLaVA.\n");
     printf("    --log-disable           disables logging to a file.\n");
-    printf("    --nobrowser             Do not attempt to open a web browser tab at startup.\n");
-    printf("    --unsecure              disables pledge() sandboxing on Linux and OpenBSD\n");
-    printf("    --nocompile             disables runtime compilation of gpu support\n");
     printf("\n");
 }
 
@@ -2334,6 +2331,20 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         {
             FLAG_nocompile = true;
         }
+        else if (arg == "--gpu")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            FLAG_gpu = llamafile_gpu_parse(argv[i]);
+            if (FLAG_gpu == -1)
+            {
+                fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
+                exit(1);
+            }
+        }
         else
         {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -3043,7 +3054,7 @@ int server_cli(int argc, char ** argv) {
         llamafile_launch_browser(url);
     }
 
-    if (!sparams.unsecure && !ggml_metal_supported() && !ggml_cuda_supported()) {
+    if (!sparams.unsecure && !llamafile_gpu_supported()) {
         // Enables pledge() security on Linux and OpenBSD.
         // - We do this *after* binding the server socket.
         // - We do this *after* opening the log file for writing.
