diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 94a8eceb302b9..c6fdaa7c33faa 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -1,19 +1,41 @@ #!/bin/bash if [ $# -lt 2 ]; then - echo "usage: ./scripts/compare-commits.sh [additional llama-bench arguments]" + echo "usage: ./scripts/compare-commits.sh [tool] [additional arguments]" + echo " tool: 'llama-bench' (default) or 'test-backend-ops'" + echo " additional arguments: passed to the selected tool" exit 1 fi set -e set -x +# Parse arguments +commit1=$1 +commit2=$2 +tool=${3:-llama-bench} +additional_args="${@:4}" + +# Validate tool argument +if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then + echo "Error: tool must be 'llama-bench' or 'test-backend-ops'" + exit 1 +fi + # verify at the start that the compare script has all the necessary dependencies installed ./scripts/compare-llama-bench.py --check -bench_args="${@:3}" +if [ "$tool" = "llama-bench" ]; then + db_file="llama-bench.sqlite" + target="llama-bench" + run_args="-o sql -oe md $additional_args" +else # test-backend-ops + db_file="test-backend-ops.sqlite" + target="test-backend-ops" + run_args="perf --output sql $additional_args" +fi -rm -f llama-bench.sqlite > /dev/null +rm -f "$db_file" > /dev/null # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) if [ -n "$GGML_CUDA" ]; then @@ -25,14 +47,14 @@ dir="build-bench" function run { rm -fr ${dir} > /dev/null cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null - cmake --build ${dir} -t llama-bench > /dev/null - ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite + cmake --build ${dir} -t $target > /dev/null + ${dir}/bin/$target $run_args | sqlite3 "$db_file" } -git checkout $1 > /dev/null +git checkout $commit1 > /dev/null run -git checkout $2 > /dev/null +git checkout $commit2 > /dev/null run -./scripts/compare-llama-bench.py -b $1 -c $2 +./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file" diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 30e3cf8649e8a..3288df92caac9 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 -import logging import argparse +import csv import heapq -import sys +import json +import logging import os -from glob import glob import sqlite3 -import json -import csv -from typing import Optional, Union +import sys from collections.abc import Iterator, Sequence +from glob import glob +from typing import Optional, Union try: import git @@ -23,7 +23,7 @@ logger = logging.getLogger("compare-llama-bench") # All llama-bench SQL fields -DB_FIELDS = [ +LLAMA_BENCH_DB_FIELDS = [ "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", @@ -33,7 +33,7 @@ "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", ] -DB_TYPES = [ +LLAMA_BENCH_DB_TYPES = [ "TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER", @@ -42,20 +42,51 @@ "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "TEXT", "INTEGER", "INTEGER", "REAL", "REAL", ] -assert len(DB_FIELDS) == len(DB_TYPES) -# Properties by which to differentiate results 
per commit: -KEY_PROPERTIES = [ +# All test-backend-ops SQL fields +TEST_BACKEND_OPS_DB_FIELDS = [ + "test_time", "build_commit", "build_number", "backend_name", "op_name", "op_params", "test_mode", + "supported", "passed", "error_message", "time_us", "flops", "bandwidth_gb_s", "memory_kb", "n_runs" +] + +TEST_BACKEND_OPS_DB_TYPES = [ + "TEXT", "TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT", + "INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL", "INTEGER", "INTEGER" +] + +# Legacy aliases for backward compatibility +DB_FIELDS = LLAMA_BENCH_DB_FIELDS +DB_TYPES = LLAMA_BENCH_DB_TYPES + +assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES) +assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES) + +# Properties by which to differentiate results per commit for llama-bench: +LLAMA_BENCH_KEY_PROPERTIES = [ "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth" ] -# Properties that are boolean and are converted to Yes/No for the table: -BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"] +# Properties by which to differentiate results per commit for test-backend-ops: +TEST_BACKEND_OPS_KEY_PROPERTIES = [ + "backend_name", "op_name", "op_params", "test_mode" +] + +# Legacy alias for backward compatibility +KEY_PROPERTIES = LLAMA_BENCH_KEY_PROPERTIES -# Header names for the table: -PRETTY_NAMES = { +# Properties that are boolean and are converted to Yes/No for the table (llama-bench): +LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"] + +# Properties that are boolean and are converted to Yes/No for the table (test-backend-ops): +TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"] + +# Legacy alias for backward compatibility +BOOL_PROPERTIES = LLAMA_BENCH_BOOL_PROPERTIES + +# Header names for the table (llama-bench): +LLAMA_BENCH_PRETTY_NAMES = { "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers", "tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]", "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings", @@ -64,13 +95,31 @@ "flash_attn": "FlashAttention", } -DEFAULT_SHOW = ["model_type"] # Always show these properties by default. -DEFAULT_HIDE = ["model_filename"] # Always hide these properties by default. +# Header names for the table (test-backend-ops): +TEST_BACKEND_OPS_PRETTY_NAMES = { + "backend_name": "Backend", "op_name": "Operation", "op_params": "Parameters", "test_mode": "Mode", + "supported": "Supported", "passed": "Passed", "error_message": "Error", + "flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs" +} + +# Legacy alias for backward compatibility +PRETTY_NAMES = LLAMA_BENCH_PRETTY_NAMES + +DEFAULT_SHOW_LLAMA_BENCH = ["model_type"] # Always show these properties by default. +DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"] # Always hide these properties by default. + +DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"] # Always show these properties by default. +DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"] # Always hide these properties by default. 
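Reviewer note: the parallel *_DB_FIELDS / *_DB_TYPES lists above are what the script later zips into its in-memory SQLite schema. A minimal sketch of that step, assuming the constants defined above; the create_table helper is hypothetical and only illustrates the same join/zip expression used inline in LlamaBenchDataSQLite3.__init__ further down in this patch:

import sqlite3

def create_table(cursor: sqlite3.Cursor, table_name: str,
                 fields: list[str], types: list[str]) -> None:
    # Pair each field name with its SQL type, e.g. "op_name TEXT, flops REAL, ...",
    # mirroring the zip()/join() expression in LlamaBenchDataSQLite3.__init__.
    columns = ", ".join(f"{name} {sql_type}" for name, sql_type in zip(fields, types))
    cursor.execute(f"CREATE TABLE {table_name}({columns});")

con = sqlite3.connect(":memory:")
create_table(con.cursor(), "test_backend_ops",
             TEST_BACKEND_OPS_DB_FIELDS, TEST_BACKEND_OPS_DB_TYPES)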
+ +# Legacy aliases for backward compatibility +DEFAULT_SHOW = DEFAULT_SHOW_LLAMA_BENCH +DEFAULT_HIDE = DEFAULT_HIDE_LLAMA_BENCH GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "] # Strip prefixes for smaller tables. MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"} -DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux): +DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux): +For llama-bench: $ git checkout master $ make clean && make llama-bench $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite @@ -79,6 +128,15 @@ $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite $ ./scripts/compare-llama-bench.py +For test-backend-ops: +$ git checkout master +$ make clean && make test-backend-ops +$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite +$ git checkout some_branch +$ make clean && make test-backend-ops +$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite +$ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite + Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench. """ @@ -96,6 +154,12 @@ "Defaults to the non-master commit for which llama-bench was run most recently." ) parser.add_argument("-c", "--compare", help=help_c) +help_t = ( + "The tool whose data is being compared. " + "Either 'llama-bench' (default) or 'test-backend-ops'. " + "This determines the database schema and comparison logic used." +) +parser.add_argument("-t", "--tool", help=help_t, default="llama-bench", choices=["llama-bench", "test-backend-ops"]) help_i = ( "JSON/JSONL/SQLite/CSV files for comparing commits. " "Specify multiple times to use multiple input files (JSON/CSV only). 
" @@ -142,8 +206,14 @@ sys.exit(1) input_file = known_args.input -if not input_file and os.path.exists("./llama-bench.sqlite"): - input_file = ["llama-bench.sqlite"] +tool = known_args.tool + +if not input_file: + if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"): + input_file = ["llama-bench.sqlite"] + elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"): + input_file = ["test-backend-ops.sqlite"] + if not input_file: sqlite_files = glob("*.sqlite") if len(sqlite_files) == 1: @@ -161,14 +231,21 @@ class LlamaBenchData: build_len_max: int build_len: int = 8 builds: list[str] = [] - check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"]) + tool: str = "llama-bench" # Tool type: "llama-bench" or "test-backend-ops" - def __init__(self): + def __init__(self, tool: str = "llama-bench"): + self.tool = tool try: self.repo = git.Repo(".", search_parent_directories=True) except git.InvalidGitRepositoryError: self.repo = None + # Set schema-specific properties based on tool + if self.tool == "llama-bench": + self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"]) + else: # test-backend-ops + self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"]) + def _builds_init(self): self.build_len = self.build_len_min @@ -252,52 +329,105 @@ def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare class LlamaBenchDataSQLite3(LlamaBenchData): connection: sqlite3.Connection cursor: sqlite3.Cursor + table_name: str - def __init__(self): - super().__init__() + def __init__(self, tool: str = "llama-bench"): + super().__init__(tool) self.connection = sqlite3.connect(":memory:") self.cursor = self.connection.cursor() - self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});") + + # Set table name and schema based on tool + if self.tool == "llama-bench": + self.table_name = "test" + db_fields = LLAMA_BENCH_DB_FIELDS + db_types = LLAMA_BENCH_DB_TYPES + else: # test-backend-ops + self.table_name = "test_backend_ops" + db_fields = TEST_BACKEND_OPS_DB_FIELDS + db_types = TEST_BACKEND_OPS_DB_TYPES + + self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});") def _builds_init(self): if self.connection: - self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0] - self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0] + self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0] + self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0] if self.build_len_min != self.build_len_max: logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. 
" "Try purging the the database of old commits.") - self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});") + self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});") - builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall() + builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall() self.builds = list(map(lambda b: b[0], builds)) # list[tuple[str]] -> list[str] super()._builds_init() def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]: data = self.cursor.execute( - "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall() + f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall() return reversed(data) if reverse else data def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]: + if self.tool == "llama-bench": + return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare) + else: # test-backend-ops + return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare) + + def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]: select_string = ", ".join( [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"]) equal_string = " AND ".join( - [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [ + [f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [ f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"] ) group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"]) - query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} " + query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} " + f"GROUP BY {group_order_string} ORDER BY {group_order_string};") + return self.cursor.execute(query).fetchall() + + def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]: + # For test-backend-ops, we compare FLOPS and bandwidth metrics (prioritizing FLOPS over bandwidth) + select_string = ", ".join( + [f"tb.{p}" for p in properties] + [ + "AVG(tb.flops)", "AVG(tc.flops)", + "AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)" + ]) + equal_string = " AND ".join( + [f"tb.{p} = tc.{p}" for p in TEST_BACKEND_OPS_KEY_PROPERTIES] + [ + f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'", + "tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"] # Only compare successful tests + ) + group_order_string = ", ".join([f"tb.{p}" for p in properties]) + query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} " f"GROUP BY {group_order_string} ORDER BY {group_order_string};") return self.cursor.execute(query).fetchall() class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3): - def __init__(self, data_file: str): - super().__init__() + def __init__(self, data_file: str, tool: str = "llama-bench"): + super().__init__(tool) self.connection.close() self.connection = sqlite3.connect(data_file) self.cursor = self.connection.cursor() + + # Check which table exists in the database + tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE 
type='table';").fetchall() + table_names = [table[0] for table in tables] + + if "test_backend_ops" in table_names and tool == "test-backend-ops": + self.table_name = "test_backend_ops" + elif "test" in table_names and tool == "llama-bench": + self.table_name = "test" + elif "test" in table_names: + # Fallback to test table for backward compatibility + self.table_name = "test" + if tool == "test-backend-ops": + logger.warning("test-backend-ops tool specified but only 'test' table found. Assuming llama-bench data.") + self.tool = "llama-bench" + else: + raise RuntimeError(f"No suitable table found for tool '{tool}' in database. Available tables: {table_names}") + self._builds_init() @staticmethod @@ -317,20 +447,23 @@ def valid_format(data_file: str) -> bool: class LlamaBenchDataJSONL(LlamaBenchDataSQLite3): - def __init__(self, data_file: str): - super().__init__() + def __init__(self, data_file: str, tool: str = "llama-bench"): + super().__init__(tool) + + # Get the appropriate field list based on tool + db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS with open(data_file, "r", encoding="utf-8") as fp: for i, line in enumerate(fp): parsed = json.loads(line) - for k in parsed.keys() - set(DB_FIELDS): + for k in parsed.keys() - set(db_fields): del parsed[k] if (missing_keys := self._check_keys(parsed.keys())): raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}") - self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) + self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) self._builds_init() @@ -349,21 +482,24 @@ def valid_format(data_file: str) -> bool: class LlamaBenchDataJSON(LlamaBenchDataSQLite3): - def __init__(self, data_files: list[str]): - super().__init__() + def __init__(self, data_files: list[str], tool: str = "llama-bench"): + super().__init__(tool) + + # Get the appropriate field list based on tool + db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS for data_file in data_files: with open(data_file, "r", encoding="utf-8") as fp: parsed = json.load(fp) for i, entry in enumerate(parsed): - for k in entry.keys() - set(DB_FIELDS): + for k in entry.keys() - set(db_fields): del entry[k] if (missing_keys := self._check_keys(entry.keys())): raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}") - self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values())) + self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' 
* len(entry))});", tuple(entry.values())) self._builds_init() @@ -384,21 +520,24 @@ def valid_format(data_files: list[str]) -> bool: class LlamaBenchDataCSV(LlamaBenchDataSQLite3): - def __init__(self, data_files: list[str]): - super().__init__() + def __init__(self, data_files: list[str], tool: str = "llama-bench"): + super().__init__(tool) + + # Get the appropriate field list based on tool + db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS for data_file in data_files: with open(data_file, "r", encoding="utf-8") as fp: for i, parsed in enumerate(csv.DictReader(fp)): keys = set(parsed.keys()) - for k in keys - set(DB_FIELDS): + for k in keys - set(db_fields): del parsed[k] if (missing_keys := self._check_keys(keys)): raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}") - self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) + self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) self._builds_init() @@ -419,21 +558,90 @@ def valid_format(data_files: list[str]) -> bool: return True +def format_flops(flops_value: float) -> str: + """Format FLOPS values with appropriate units for better readability.""" + if flops_value == 0: + return "0.00" + + # Define unit thresholds and names + units = [ + (1e12, "T"), # TeraFLOPS + (1e9, "G"), # GigaFLOPS + (1e6, "M"), # MegaFLOPS + (1e3, "k"), # kiloFLOPS + (1, "") # FLOPS + ] + + for threshold, unit in units: + if abs(flops_value) >= threshold: + formatted_value = flops_value / threshold + if formatted_value >= 100: + return f"{formatted_value:.1f}{unit}" + else: + return f"{formatted_value:.2f}{unit}" + + # Fallback for very small values + return f"{flops_value:.2f}" + + +def format_flops_for_table(flops_value: float, target_unit: str) -> str: + """Format FLOPS values for table display without unit suffix (since unit is in header).""" + if flops_value == 0: + return "0.00" + + # Define unit thresholds based on target unit + unit_divisors = { + "TFLOPS": 1e12, + "GFLOPS": 1e9, + "MFLOPS": 1e6, + "kFLOPS": 1e3, + "FLOPS": 1 + } + + divisor = unit_divisors.get(target_unit, 1) + formatted_value = flops_value / divisor + + if formatted_value >= 100: + return f"{formatted_value:.1f}" + else: + return f"{formatted_value:.2f}" + + +def get_flops_unit_name(flops_values: list) -> str: + """Determine the best FLOPS unit name based on the magnitude of values.""" + if not flops_values or all(v == 0 for v in flops_values): + return "FLOPS" + + # Find the maximum absolute value to determine appropriate unit + max_flops = max(abs(v) for v in flops_values if v != 0) + + if max_flops >= 1e12: + return "TFLOPS" + elif max_flops >= 1e9: + return "GFLOPS" + elif max_flops >= 1e6: + return "MFLOPS" + elif max_flops >= 1e3: + return "kFLOPS" + else: + return "FLOPS" + + bench_data = None if len(input_file) == 1: if LlamaBenchDataSQLite3File.valid_format(input_file[0]): - bench_data = LlamaBenchDataSQLite3File(input_file[0]) + bench_data = LlamaBenchDataSQLite3File(input_file[0], tool) elif LlamaBenchDataJSON.valid_format(input_file): - bench_data = LlamaBenchDataJSON(input_file) + bench_data = LlamaBenchDataJSON(input_file, tool) elif LlamaBenchDataJSONL.valid_format(input_file[0]): - bench_data = LlamaBenchDataJSONL(input_file[0]) + bench_data = LlamaBenchDataJSONL(input_file[0], tool) elif 
LlamaBenchDataCSV.valid_format(input_file): - bench_data = LlamaBenchDataCSV(input_file) + bench_data = LlamaBenchDataCSV(input_file, tool) else: if LlamaBenchDataJSON.valid_format(input_file): - bench_data = LlamaBenchDataJSON(input_file) + bench_data = LlamaBenchDataJSON(input_file, tool) elif LlamaBenchDataCSV.valid_format(input_file): - bench_data = LlamaBenchDataCSV(input_file) + bench_data = LlamaBenchDataCSV(input_file, tool) if not bench_data: raise RuntimeError("No valid (or some invalid) input files found.") @@ -504,12 +712,27 @@ def valid_format(data_files: list[str]) -> bool: name_compare = bench_data.get_commit_name(hexsha8_compare) +# Get tool-specific configuration +if tool == "llama-bench": + key_properties = LLAMA_BENCH_KEY_PROPERTIES + bool_properties = LLAMA_BENCH_BOOL_PROPERTIES + pretty_names = LLAMA_BENCH_PRETTY_NAMES + default_show = DEFAULT_SHOW_LLAMA_BENCH + default_hide = DEFAULT_HIDE_LLAMA_BENCH +else: # test-backend-ops + key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES + bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES + pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES + default_show = DEFAULT_SHOW_TEST_BACKEND_OPS + default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS + # If the user provided columns to group the results by, use them: if known_args.show is not None: show = known_args.show.split(",") unknown_cols = [] for prop in show: - if prop not in KEY_PROPERTIES[:-3]: # Last three values are n_prompt, n_gen, n_depth. + valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3] # Exclude n_prompt, n_gen, n_depth for llama-bench + if prop not in valid_props: unknown_cols.append(prop) if unknown_cols: logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}") @@ -518,32 +741,50 @@ def valid_format(data_files: list[str]) -> bool: rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare) # Otherwise, select those columns where the values are not all the same: else: - rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare) + rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare) properties_different = [] - for i, kp_i in enumerate(KEY_PROPERTIES): - if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]: - continue - for row_full in rows_full: - if row_full[i] != rows_full[0][i]: - properties_different.append(kp_i) - break + + if tool == "llama-bench": + # For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic + check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]] + for i, kp_i in enumerate(key_properties): + if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]: + continue + for row_full in rows_full: + if row_full[i] != rows_full[0][i]: + properties_different.append(kp_i) + break + else: # test-backend-ops + # For test-backend-ops, check all key properties + for i, kp_i in enumerate(key_properties): + if kp_i in default_show: + continue + for row_full in rows_full: + if row_full[i] != rows_full[0][i]: + properties_different.append(kp_i) + break show = [] - # Show CPU and/or GPU by default even if the hardware for all results is the same: - if rows_full and "n_gpu_layers" not in properties_different: - ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")]) - if ngl != 99 and "cpu_info" not in properties_different: - show.append("cpu_info") + if tool == "llama-bench": + # Show CPU and/or GPU by default even if the hardware for all results is the same: + if rows_full and 
"n_gpu_layers" not in properties_different: + ngl = int(rows_full[0][key_properties.index("n_gpu_layers")]) - show += properties_different + if ngl != 99 and "cpu_info" not in properties_different: + show.append("cpu_info") - index_default = 0 - for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]: - if prop in show: - index_default += 1 - show = show[:index_default] + DEFAULT_SHOW + show[index_default:] - for prop in DEFAULT_HIDE: + show += properties_different + + index_default = 0 + for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]: + if prop in show: + index_default += 1 + show = show[:index_default] + default_show + show[index_default:] + else: # test-backend-ops + show = default_show + properties_different + + for prop in default_hide: try: show.remove(prop) except ValueError: @@ -551,7 +792,7 @@ def valid_format(data_files: list[str]) -> bool: # Add plot_x parameter to parameters to show if it's not already present: if known_args.plot: - for k, v in PRETTY_NAMES.items(): + for k, v in pretty_names.items(): if v == known_args.plot_x and k not in show: show.append(k) break @@ -563,60 +804,115 @@ def valid_format(data_files: list[str]) -> bool: sys.exit(1) table = [] -for row in rows_show: - n_prompt = int(row[-5]) - n_gen = int(row[-4]) - n_depth = int(row[-3]) - if n_prompt != 0 and n_gen == 0: - test_name = f"pp{n_prompt}" - elif n_prompt == 0 and n_gen != 0: - test_name = f"tg{n_gen}" - else: - test_name = f"pp{n_prompt}+tg{n_gen}" - if n_depth != 0: - test_name = f"{test_name}@d{n_depth}" - # Regular columns test name avg t/s values Speedup - # VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV - table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])]) +primary_metric = "FLOPS" # Default to FLOPS for test-backend-ops + +if tool == "llama-bench": + # For llama-bench, create test names and compare avg_ts values + for row in rows_show: + n_prompt = int(row[-5]) + n_gen = int(row[-4]) + n_depth = int(row[-3]) + if n_prompt != 0 and n_gen == 0: + test_name = f"pp{n_prompt}" + elif n_prompt == 0 and n_gen != 0: + test_name = f"tg{n_gen}" + else: + test_name = f"pp{n_prompt}+tg{n_gen}" + if n_depth != 0: + test_name = f"{test_name}@d{n_depth}" + # Regular columns test name avg t/s values Speedup + # VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV + table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])]) +else: # test-backend-ops + # Determine the primary metric by checking rows until we find one with valid data + if rows_show: + primary_metric = "FLOPS" # Default to FLOPS + flops_values = [] + + # Collect all FLOPS values to determine the best unit + for sample_row in rows_show: + baseline_flops = float(sample_row[-4]) + compare_flops = float(sample_row[-3]) + baseline_bandwidth = float(sample_row[-2]) + + if baseline_flops > 0: + flops_values.extend([baseline_flops, compare_flops]) + elif baseline_bandwidth > 0 and not flops_values: + primary_metric = "Bandwidth (GB/s)" + + # If we have FLOPS data, determine the appropriate unit + if flops_values: + primary_metric = get_flops_unit_name(flops_values) + + # For test-backend-ops, prioritize FLOPS > bandwidth for comparison + for row in rows_show: + # Extract metrics: flops, bandwidth_gb_s (baseline and compare) + baseline_flops = float(row[-4]) + compare_flops = float(row[-3]) + baseline_bandwidth = float(row[-2]) + compare_bandwidth = float(row[-1]) + + # Determine which metric to use for comparison (prioritize FLOPS > bandwidth) + if 
baseline_flops > 0 and compare_flops > 0: + # Use FLOPS comparison (higher is better) + speedup = compare_flops / baseline_flops + baseline_str = format_flops_for_table(baseline_flops, primary_metric) + compare_str = format_flops_for_table(compare_flops, primary_metric) + elif baseline_bandwidth > 0 and compare_bandwidth > 0: + # Use bandwidth comparison (higher is better) + speedup = compare_bandwidth / baseline_bandwidth + baseline_str = f"{baseline_bandwidth:.2f}" + compare_str = f"{compare_bandwidth:.2f}" + else: + # Fallback if no valid data is available + baseline_str = "N/A" + compare_str = "N/A" + speedup = 1.0 + + table.append(list(row[:-4]) + [baseline_str, compare_str, speedup]) # Some a-posteriori fixes to make the table contents prettier: -for bool_property in BOOL_PROPERTIES: +for bool_property in bool_properties: if bool_property in show: ip = show.index(bool_property) for row_table in table: row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No" -if "model_type" in show: - ip = show.index("model_type") - for (old, new) in MODEL_SUFFIX_REPLACE.items(): - for row_table in table: - row_table[ip] = row_table[ip].replace(old, new) +if tool == "llama-bench": + if "model_type" in show: + ip = show.index("model_type") + for (old, new) in MODEL_SUFFIX_REPLACE.items(): + for row_table in table: + row_table[ip] = row_table[ip].replace(old, new) -if "model_size" in show: - ip = show.index("model_size") - for row_table in table: - row_table[ip] = float(row_table[ip]) / 1024 ** 3 + if "model_size" in show: + ip = show.index("model_size") + for row_table in table: + row_table[ip] = float(row_table[ip]) / 1024 ** 3 -if "gpu_info" in show: - ip = show.index("gpu_info") - for row_table in table: - for gns in GPU_NAME_STRIP: - row_table[ip] = row_table[ip].replace(gns, "") + if "gpu_info" in show: + ip = show.index("gpu_info") + for row_table in table: + for gns in GPU_NAME_STRIP: + row_table[ip] = row_table[ip].replace(gns, "") - gpu_names = row_table[ip].split(", ") - num_gpus = len(gpu_names) - all_names_the_same = len(set(gpu_names)) == 1 - if len(gpu_names) >= 2 and all_names_the_same: - row_table[ip] = f"{num_gpus}x {gpu_names[0]}" + gpu_names = row_table[ip].split(", ") + num_gpus = len(gpu_names) + all_names_the_same = len(set(gpu_names)) == 1 + if len(gpu_names) >= 2 and all_names_the_same: + row_table[ip] = f"{num_gpus}x {gpu_names[0]}" -headers = [PRETTY_NAMES[p] for p in show] -headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] +headers = [pretty_names.get(p, p) for p in show] +if tool == "llama-bench": + headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] +else: # test-backend-ops + headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"] if known_args.plot: - def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False): + def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"): try: - import matplotlib.pyplot as plt import matplotlib + import matplotlib.pyplot as plt matplotlib.use('Agg') except ImportError as e: logger.error("matplotlib is required for --plot.") @@ -746,8 +1042,14 @@ def make_axes(num_groups, max_cols=2, base_size=(8, 4)): title = ', '.join(title_parts) if title_parts else 
"Performance comparison" + # Determine y-axis label based on tool type + if tool_type == "llama-bench": + y_label = "Tokens per second (t/s)" + else: # test-backend-ops + y_label = metric_name + ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold') - ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold') + ax.set_ylabel(y_label, fontsize=12, fontweight='bold') ax.set_title(title, fontsize=12, fontweight='bold') ax.legend(loc='best', fontsize=10) ax.grid(True, alpha=0.3) @@ -765,7 +1067,7 @@ def make_axes(num_groups, max_cols=2, base_size=(8, 4)): plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() - create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale) + create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric) print(tabulate( # noqa: NP100 table,