Skip to content

Commit d258e7c

Browse files
committed
add exponential steps, update documentation
1 parent 5a7754a commit d258e7c

File tree

2 files changed

+66
-41
lines changed

2 files changed

+66
-41
lines changed

tools/llama-bench/README.md

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,20 @@ Performance testing tool for llama.cpp.
2020
## Syntax
2121

2222
```
23-
usage: ./llama-bench [options]
23+
usage: build/bin/llama-bench [options]
2424
2525
options:
2626
-h, --help
27+
--numa <distribute|isolate|numactl> numa mode (default: disabled)
28+
-r, --repetitions <n> number of times to repeat each test (default: 5)
29+
--prio <0|1|2|3> process/thread priority (default: 0)
30+
--delay <0...N> (seconds) delay between each test (default: 0)
31+
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
32+
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
33+
-v, --verbose verbose output
34+
--progress print test progress indicators
35+
36+
test parameters:
2737
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
2838
-p, --n-prompt <n> (default: 512)
2939
-n, --n-gen <n> (default: 128)
@@ -33,28 +43,25 @@ options:
3343
-ub, --ubatch-size <n> (default: 512)
3444
-ctk, --cache-type-k <t> (default: f16)
3545
-ctv, --cache-type-v <t> (default: f16)
36-
-t, --threads <n> (default: 8)
46+
-t, --threads <n> (default: 16)
3747
-C, --cpu-mask <hex,hex> (default: 0x0)
3848
--cpu-strict <0|1> (default: 0)
3949
--poll <0...100> (default: 50)
4050
-ngl, --n-gpu-layers <n> (default: 99)
41-
-rpc, --rpc <rpc_servers> (default: )
4251
-sm, --split-mode <none|layer|row> (default: layer)
4352
-mg, --main-gpu <i> (default: 0)
4453
-nkvo, --no-kv-offload <0|1> (default: 0)
4554
-fa, --flash-attn <0|1> (default: 0)
4655
-mmp, --mmap <0|1> (default: 1)
47-
--numa <distribute|isolate|numactl> (default: disabled)
4856
-embd, --embeddings <0|1> (default: 0)
4957
-ts, --tensor-split <ts0/ts1/..> (default: 0)
50-
-r, --repetitions <n> (default: 5)
51-
--prio <0|1|2|3> (default: 0)
52-
--delay <0...N> (seconds) (default: 0)
53-
-o, --output <csv|json|jsonl|md|sql> (default: md)
54-
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
55-
-v, --verbose (default: 0)
56-
57-
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
58+
-ot, --override-tensors <tensor name pattern>=<buffer type>;...
59+
(default: disabled)
60+
-nopo, --no-op-offload <0|1> (default: 0)
61+
62+
Multiple values can be given for each parameter by separating them with ','
63+
or by specifying the parameter multiple times. Ranges can be given as
64+
'start-end' or 'start-end+step' or 'start-end*mult'.
5865
```
5966

6067
llama-bench can perform three types of tests:

tools/llama-bench/llama-bench.cpp

Lines changed: 47 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -196,18 +196,28 @@ static std::string pair_str(const std::pair<int, int> & p) {
196196
}
197197

198198
static std::vector<int> parse_int_range(const std::string & s) {
199-
// first[-last[+step]][,first[-last[+step]]...]
200-
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:\+(\d+))?)?(,|$))");
199+
// first[-last[(+|*)step]]
200+
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
201+
201202
std::smatch match;
202203
std::string::const_iterator search_start(s.cbegin());
203204
std::vector<int> result;
204205
while (std::regex_search(search_start, s.cend(), match, range_regex)) {
205-
int first = std::stoi(match[1]);
206-
int last = match[2].matched ? std::stoi(match[2]) : first;
207-
int step = match[3].matched ? std::stoi(match[3]) : 1;
206+
int first = std::stoi(match[1]);
207+
int last = match[2].matched ? std::stoi(match[2]) : first;
208+
char op = match[3].matched ? match[3].str()[0] : '+';
209+
int step = match[4].matched ? std::stoi(match[4]) : 1;
208210

209-
for (int i = first; i <= last; i += step) {
211+
for (int i = first; i <= last;) {
210212
result.push_back(i);
213+
214+
if (op == '+') {
215+
i += step;
216+
} else if (op == '*') {
217+
i *= step;
218+
} else {
219+
throw std::invalid_argument("invalid range format");
220+
}
211221
}
212222
search_start = match.suffix().first;
213223
}
@@ -275,7 +285,7 @@ static const cmd_params cmd_params_defaults = {
275285
/* no_kv_offload */ { false },
276286
/* flash_attn */ { false },
277287
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
278-
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
288+
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
279289
/* use_mmap */ { true },
280290
/* embeddings */ { false },
281291
/* no_op_offload */ { false },
@@ -294,13 +304,29 @@ static void print_usage(int /* argc */, char ** argv) {
294304
printf("\n");
295305
printf("options:\n");
296306
printf(" -h, --help\n");
307+
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
308+
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
309+
cmd_params_defaults.reps);
310+
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
311+
cmd_params_defaults.prio);
312+
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
313+
cmd_params_defaults.delay);
314+
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
315+
output_format_str(cmd_params_defaults.output_format));
316+
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
317+
output_format_str(cmd_params_defaults.output_format_stderr));
318+
printf(" -v, --verbose verbose output\n");
319+
printf(" --progress print test progress indicators\n");
320+
printf("\n");
321+
printf("test parameters:\n");
297322
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
298323
printf(" -p, --n-prompt <n> (default: %s)\n",
299324
join(cmd_params_defaults.n_prompt, ",").c_str());
300325
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
301326
printf(" -pg <pp,tg> (default: %s)\n",
302327
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
303-
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
328+
printf(" -d, --n-depth <n> (default: %s)\n",
329+
join(cmd_params_defaults.n_depth, ",").c_str());
304330
printf(" -b, --batch-size <n> (default: %s)\n",
305331
join(cmd_params_defaults.n_batch, ",").c_str());
306332
printf(" -ub, --ubatch-size <n> (default: %s)\n",
@@ -332,25 +358,17 @@ static void print_usage(int /* argc */, char ** argv) {
332358
join(cmd_params_defaults.flash_attn, ",").c_str());
333359
printf(" -mmp, --mmap <0|1> (default: %s)\n",
334360
join(cmd_params_defaults.use_mmap, ",").c_str());
335-
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
336361
printf(" -embd, --embeddings <0|1> (default: %s)\n",
337362
join(cmd_params_defaults.embeddings, ",").c_str());
338363
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
339-
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
340-
printf(" -nopo, --no-op-offload <i> (default: 0)\n");
341-
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
342-
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
343-
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
344-
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
345-
output_format_str(cmd_params_defaults.output_format));
346-
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
347-
output_format_str(cmd_params_defaults.output_format_stderr));
348-
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
349-
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
364+
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
365+
printf(" (default: disabled)\n");
366+
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
350367
printf("\n");
351368
printf(
352-
"Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
353-
"multiple times. Ranges can be specified with 'first-last' or 'first-last+step'.\n");
369+
"Multiple values can be given for each parameter by separating them with ','\n"
370+
"or by specifying the parameter multiple times. Ranges can be given as\n"
371+
"'start-end' or 'start-end+step' or 'start-end*mult'.\n");
354372
}
355373

356374
static ggml_type ggml_type_from_name(const std::string & s) {
@@ -618,13 +636,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
618636
}
619637
auto p = string_split<bool>(argv[i], split_delim);
620638
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
621-
} else if (arg == "-nopo" || arg == "--no-op-offload") {
622-
if (++i >= argc) {
623-
invalid_param = true;
624-
break;
625-
}
626-
auto p = string_split<bool>(argv[i], split_delim);
627-
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
639+
} else if (arg == "-nopo" || arg == "--no-op-offload") {
640+
if (++i >= argc) {
641+
invalid_param = true;
642+
break;
643+
}
644+
auto p = string_split<bool>(argv[i], split_delim);
645+
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
628646
} else if (arg == "-ts" || arg == "--tensor-split") {
629647
if (++i >= argc) {
630648
invalid_param = true;

0 commit comments

Comments
 (0)