diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 94a8eceb302b9..c6fdaa7c33faa 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -1,19 +1,41 @@ #!/bin/bash if [ $# -lt 2 ]; then - echo "usage: ./scripts/compare-commits.sh [additional llama-bench arguments]" + echo "usage: ./scripts/compare-commits.sh [tool] [additional arguments]" + echo " tool: 'llama-bench' (default) or 'test-backend-ops'" + echo " additional arguments: passed to the selected tool" exit 1 fi set -e set -x +# Parse arguments +commit1=$1 +commit2=$2 +tool=${3:-llama-bench} +additional_args="${@:4}" + +# Validate tool argument +if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then + echo "Error: tool must be 'llama-bench' or 'test-backend-ops'" + exit 1 +fi + # verify at the start that the compare script has all the necessary dependencies installed ./scripts/compare-llama-bench.py --check -bench_args="${@:3}" +if [ "$tool" = "llama-bench" ]; then + db_file="llama-bench.sqlite" + target="llama-bench" + run_args="-o sql -oe md $additional_args" +else # test-backend-ops + db_file="test-backend-ops.sqlite" + target="test-backend-ops" + run_args="perf --output sql $additional_args" +fi -rm -f llama-bench.sqlite > /dev/null +rm -f "$db_file" > /dev/null # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) if [ -n "$GGML_CUDA" ]; then @@ -25,14 +47,14 @@ dir="build-bench" function run { rm -fr ${dir} > /dev/null cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null - cmake --build ${dir} -t llama-bench > /dev/null - ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite + cmake --build ${dir} -t $target > /dev/null + ${dir}/bin/$target $run_args | sqlite3 "$db_file" } -git checkout $1 > /dev/null +git checkout $commit1 > /dev/null run -git checkout $2 > /dev/null +git checkout $commit2 > /dev/null run -./scripts/compare-llama-bench.py -b $1 -c $2 +./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file" diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 30e3cf8649e8a..3288df92caac9 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 -import logging import argparse +import csv import heapq -import sys +import json +import logging import os -from glob import glob import sqlite3 -import json -import csv -from typing import Optional, Union +import sys from collections.abc import Iterator, Sequence +from glob import glob +from typing import Optional, Union try: import git @@ -23,7 +23,7 @@ logger = logging.getLogger("compare-llama-bench") # All llama-bench SQL fields -DB_FIELDS = [ +LLAMA_BENCH_DB_FIELDS = [ "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", @@ -33,7 +33,7 @@ "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", ] -DB_TYPES = [ +LLAMA_BENCH_DB_TYPES = [ "TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER", @@ -42,20 +42,51 @@ "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "TEXT", "INTEGER", "INTEGER", "REAL", "REAL", ] -assert len(DB_FIELDS) == len(DB_TYPES) -# Properties by which to differentiate results 
per commit: -KEY_PROPERTIES = [ +# All test-backend-ops SQL fields +TEST_BACKEND_OPS_DB_FIELDS = [ + "test_time", "build_commit", "build_number", "backend_name", "op_name", "op_params", "test_mode", + "supported", "passed", "error_message", "time_us", "flops", "bandwidth_gb_s", "memory_kb", "n_runs" +] + +TEST_BACKEND_OPS_DB_TYPES = [ + "TEXT", "TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT", + "INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL", "INTEGER", "INTEGER" +] + +# Legacy aliases for backward compatibility +DB_FIELDS = LLAMA_BENCH_DB_FIELDS +DB_TYPES = LLAMA_BENCH_DB_TYPES + +assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES) +assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES) + +# Properties by which to differentiate results per commit for llama-bench: +LLAMA_BENCH_KEY_PROPERTIES = [ "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth" ] -# Properties that are boolean and are converted to Yes/No for the table: -BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"] +# Properties by which to differentiate results per commit for test-backend-ops: +TEST_BACKEND_OPS_KEY_PROPERTIES = [ + "backend_name", "op_name", "op_params", "test_mode" +] + +# Legacy alias for backward compatibility +KEY_PROPERTIES = LLAMA_BENCH_KEY_PROPERTIES -# Header names for the table: -PRETTY_NAMES = { +# Properties that are boolean and are converted to Yes/No for the table (llama-bench): +LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"] + +# Properties that are boolean and are converted to Yes/No for the table (test-backend-ops): +TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"] + +# Legacy alias for backward compatibility +BOOL_PROPERTIES = LLAMA_BENCH_BOOL_PROPERTIES + +# Header names for the table (llama-bench): +LLAMA_BENCH_PRETTY_NAMES = { "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers", "tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]", "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings", @@ -64,13 +95,31 @@ "flash_attn": "FlashAttention", } -DEFAULT_SHOW = ["model_type"] # Always show these properties by default. -DEFAULT_HIDE = ["model_filename"] # Always hide these properties by default. +# Header names for the table (test-backend-ops): +TEST_BACKEND_OPS_PRETTY_NAMES = { + "backend_name": "Backend", "op_name": "Operation", "op_params": "Parameters", "test_mode": "Mode", + "supported": "Supported", "passed": "Passed", "error_message": "Error", + "flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs" +} + +# Legacy alias for backward compatibility +PRETTY_NAMES = LLAMA_BENCH_PRETTY_NAMES + +DEFAULT_SHOW_LLAMA_BENCH = ["model_type"] # Always show these properties by default. +DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"] # Always hide these properties by default. + +DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"] # Always show these properties by default. +DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"] # Always hide these properties by default. 
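Reviewer note: the parallel *_DB_FIELDS / *_DB_TYPES lists above are what the script later zips into its in-memory SQLite schema. A minimal sketch of that step, assuming the constants defined above; the create_table helper is hypothetical and only illustrates the same join/zip expression used inline in LlamaBenchDataSQLite3.__init__ further down in this patch:

import sqlite3

def create_table(cursor: sqlite3.Cursor, table_name: str,
                 fields: list[str], types: list[str]) -> None:
    # Pair each field name with its SQL type, e.g. "op_name TEXT, flops REAL, ...",
    # mirroring the zip()/join() expression in LlamaBenchDataSQLite3.__init__.
    columns = ", ".join(f"{name} {sql_type}" for name, sql_type in zip(fields, types))
    cursor.execute(f"CREATE TABLE {table_name}({columns});")

con = sqlite3.connect(":memory:")
create_table(con.cursor(), "test_backend_ops",
             TEST_BACKEND_OPS_DB_FIELDS, TEST_BACKEND_OPS_DB_TYPES)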
+ +# Legacy aliases for backward compatibility +DEFAULT_SHOW = DEFAULT_SHOW_LLAMA_BENCH +DEFAULT_HIDE = DEFAULT_HIDE_LLAMA_BENCH GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "] # Strip prefixes for smaller tables. MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"} -DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux): +DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux): +For llama-bench: $ git checkout master $ make clean && make llama-bench $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite @@ -79,6 +128,15 @@ $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite $ ./scripts/compare-llama-bench.py +For test-backend-ops: +$ git checkout master +$ make clean && make test-backend-ops +$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite +$ git checkout some_branch +$ make clean && make test-backend-ops +$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite +$ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite + Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench. """ @@ -96,6 +154,12 @@ "Defaults to the non-master commit for which llama-bench was run most recently." ) parser.add_argument("-c", "--compare", help=help_c) +help_t = ( + "The tool whose data is being compared. " + "Either 'llama-bench' (default) or 'test-backend-ops'. " + "This determines the database schema and comparison logic used." +) +parser.add_argument("-t", "--tool", help=help_t, default="llama-bench", choices=["llama-bench", "test-backend-ops"]) help_i = ( "JSON/JSONL/SQLite/CSV files for comparing commits. " "Specify multiple times to use multiple input files (JSON/CSV only). 
" @@ -142,8 +206,14 @@ sys.exit(1) input_file = known_args.input -if not input_file and os.path.exists("./llama-bench.sqlite"): - input_file = ["llama-bench.sqlite"] +tool = known_args.tool + +if not input_file: + if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"): + input_file = ["llama-bench.sqlite"] + elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"): + input_file = ["test-backend-ops.sqlite"] + if not input_file: sqlite_files = glob("*.sqlite") if len(sqlite_files) == 1: @@ -161,14 +231,21 @@ class LlamaBenchData: build_len_max: int build_len: int = 8 builds: list[str] = [] - check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"]) + tool: str = "llama-bench" # Tool type: "llama-bench" or "test-backend-ops" - def __init__(self): + def __init__(self, tool: str = "llama-bench"): + self.tool = tool try: self.repo = git.Repo(".", search_parent_directories=True) except git.InvalidGitRepositoryError: self.repo = None + # Set schema-specific properties based on tool + if self.tool == "llama-bench": + self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"]) + else: # test-backend-ops + self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"]) + def _builds_init(self): self.build_len = self.build_len_min @@ -252,52 +329,105 @@ def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare class LlamaBenchDataSQLite3(LlamaBenchData): connection: sqlite3.Connection cursor: sqlite3.Cursor + table_name: str - def __init__(self): - super().__init__() + def __init__(self, tool: str = "llama-bench"): + super().__init__(tool) self.connection = sqlite3.connect(":memory:") self.cursor = self.connection.cursor() - self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});") + + # Set table name and schema based on tool + if self.tool == "llama-bench": + self.table_name = "test" + db_fields = LLAMA_BENCH_DB_FIELDS + db_types = LLAMA_BENCH_DB_TYPES + else: # test-backend-ops + self.table_name = "test_backend_ops" + db_fields = TEST_BACKEND_OPS_DB_FIELDS + db_types = TEST_BACKEND_OPS_DB_TYPES + + self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});") def _builds_init(self): if self.connection: - self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0] - self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0] + self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0] + self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0] if self.build_len_min != self.build_len_max: logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. 
" "Try purging the the database of old commits.") - self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});") + self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});") - builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall() + builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall() self.builds = list(map(lambda b: b[0], builds)) # list[tuple[str]] -> list[str] super()._builds_init() def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]: data = self.cursor.execute( - "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall() + f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall() return reversed(data) if reverse else data def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]: + if self.tool == "llama-bench": + return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare) + else: # test-backend-ops + return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare) + + def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]: select_string = ", ".join( [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"]) equal_string = " AND ".join( - [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [ + [f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [ f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"] ) group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"]) - query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} " + query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} " + f"GROUP BY {group_order_string} ORDER BY {group_order_string};") + return self.cursor.execute(query).fetchall() + + def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]: + # For test-backend-ops, we compare FLOPS and bandwidth metrics (prioritizing FLOPS over bandwidth) + select_string = ", ".join( + [f"tb.{p}" for p in properties] + [ + "AVG(tb.flops)", "AVG(tc.flops)", + "AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)" + ]) + equal_string = " AND ".join( + [f"tb.{p} = tc.{p}" for p in TEST_BACKEND_OPS_KEY_PROPERTIES] + [ + f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'", + "tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"] # Only compare successful tests + ) + group_order_string = ", ".join([f"tb.{p}" for p in properties]) + query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} " f"GROUP BY {group_order_string} ORDER BY {group_order_string};") return self.cursor.execute(query).fetchall() class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3): - def __init__(self, data_file: str): - super().__init__() + def __init__(self, data_file: str, tool: str = "llama-bench"): + super().__init__(tool) self.connection.close() self.connection = sqlite3.connect(data_file) self.cursor = self.connection.cursor() + + # Check which table exists in the database + tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE 
type='table';").fetchall() + table_names = [table[0] for table in tables] + + if "test_backend_ops" in table_names and tool == "test-backend-ops": + self.table_name = "test_backend_ops" + elif "test" in table_names and tool == "llama-bench": + self.table_name = "test" + elif "test" in table_names: + # Fallback to test table for backward compatibility + self.table_name = "test" + if tool == "test-backend-ops": + logger.warning("test-backend-ops tool specified but only 'test' table found. Assuming llama-bench data.") + self.tool = "llama-bench" + else: + raise RuntimeError(f"No suitable table found for tool '{tool}' in database. Available tables: {table_names}") + self._builds_init() @staticmethod @@ -317,20 +447,23 @@ def valid_format(data_file: str) -> bool: class LlamaBenchDataJSONL(LlamaBenchDataSQLite3): - def __init__(self, data_file: str): - super().__init__() + def __init__(self, data_file: str, tool: str = "llama-bench"): + super().__init__(tool) + + # Get the appropriate field list based on tool + db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS with open(data_file, "r", encoding="utf-8") as fp: for i, line in enumerate(fp): parsed = json.loads(line) - for k in parsed.keys() - set(DB_FIELDS): + for k in parsed.keys() - set(db_fields): del parsed[k] if (missing_keys := self._check_keys(parsed.keys())): raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}") - self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) + self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) self._builds_init() @@ -349,21 +482,24 @@ def valid_format(data_file: str) -> bool: class LlamaBenchDataJSON(LlamaBenchDataSQLite3): - def __init__(self, data_files: list[str]): - super().__init__() + def __init__(self, data_files: list[str], tool: str = "llama-bench"): + super().__init__(tool) + + # Get the appropriate field list based on tool + db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS for data_file in data_files: with open(data_file, "r", encoding="utf-8") as fp: parsed = json.load(fp) for i, entry in enumerate(parsed): - for k in entry.keys() - set(DB_FIELDS): + for k in entry.keys() - set(db_fields): del entry[k] if (missing_keys := self._check_keys(entry.keys())): raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}") - self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values())) + self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' 
* len(entry))});", tuple(entry.values())) self._builds_init() @@ -384,21 +520,24 @@ def valid_format(data_files: list[str]) -> bool: class LlamaBenchDataCSV(LlamaBenchDataSQLite3): - def __init__(self, data_files: list[str]): - super().__init__() + def __init__(self, data_files: list[str], tool: str = "llama-bench"): + super().__init__(tool) + + # Get the appropriate field list based on tool + db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS for data_file in data_files: with open(data_file, "r", encoding="utf-8") as fp: for i, parsed in enumerate(csv.DictReader(fp)): keys = set(parsed.keys()) - for k in keys - set(DB_FIELDS): + for k in keys - set(db_fields): del parsed[k] if (missing_keys := self._check_keys(keys)): raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}") - self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) + self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values())) self._builds_init() @@ -419,21 +558,90 @@ def valid_format(data_files: list[str]) -> bool: return True +def format_flops(flops_value: float) -> str: + """Format FLOPS values with appropriate units for better readability.""" + if flops_value == 0: + return "0.00" + + # Define unit thresholds and names + units = [ + (1e12, "T"), # TeraFLOPS + (1e9, "G"), # GigaFLOPS + (1e6, "M"), # MegaFLOPS + (1e3, "k"), # kiloFLOPS + (1, "") # FLOPS + ] + + for threshold, unit in units: + if abs(flops_value) >= threshold: + formatted_value = flops_value / threshold + if formatted_value >= 100: + return f"{formatted_value:.1f}{unit}" + else: + return f"{formatted_value:.2f}{unit}" + + # Fallback for very small values + return f"{flops_value:.2f}" + + +def format_flops_for_table(flops_value: float, target_unit: str) -> str: + """Format FLOPS values for table display without unit suffix (since unit is in header).""" + if flops_value == 0: + return "0.00" + + # Define unit thresholds based on target unit + unit_divisors = { + "TFLOPS": 1e12, + "GFLOPS": 1e9, + "MFLOPS": 1e6, + "kFLOPS": 1e3, + "FLOPS": 1 + } + + divisor = unit_divisors.get(target_unit, 1) + formatted_value = flops_value / divisor + + if formatted_value >= 100: + return f"{formatted_value:.1f}" + else: + return f"{formatted_value:.2f}" + + +def get_flops_unit_name(flops_values: list) -> str: + """Determine the best FLOPS unit name based on the magnitude of values.""" + if not flops_values or all(v == 0 for v in flops_values): + return "FLOPS" + + # Find the maximum absolute value to determine appropriate unit + max_flops = max(abs(v) for v in flops_values if v != 0) + + if max_flops >= 1e12: + return "TFLOPS" + elif max_flops >= 1e9: + return "GFLOPS" + elif max_flops >= 1e6: + return "MFLOPS" + elif max_flops >= 1e3: + return "kFLOPS" + else: + return "FLOPS" + + bench_data = None if len(input_file) == 1: if LlamaBenchDataSQLite3File.valid_format(input_file[0]): - bench_data = LlamaBenchDataSQLite3File(input_file[0]) + bench_data = LlamaBenchDataSQLite3File(input_file[0], tool) elif LlamaBenchDataJSON.valid_format(input_file): - bench_data = LlamaBenchDataJSON(input_file) + bench_data = LlamaBenchDataJSON(input_file, tool) elif LlamaBenchDataJSONL.valid_format(input_file[0]): - bench_data = LlamaBenchDataJSONL(input_file[0]) + bench_data = LlamaBenchDataJSONL(input_file[0], tool) elif 
LlamaBenchDataCSV.valid_format(input_file): - bench_data = LlamaBenchDataCSV(input_file) + bench_data = LlamaBenchDataCSV(input_file, tool) else: if LlamaBenchDataJSON.valid_format(input_file): - bench_data = LlamaBenchDataJSON(input_file) + bench_data = LlamaBenchDataJSON(input_file, tool) elif LlamaBenchDataCSV.valid_format(input_file): - bench_data = LlamaBenchDataCSV(input_file) + bench_data = LlamaBenchDataCSV(input_file, tool) if not bench_data: raise RuntimeError("No valid (or some invalid) input files found.") @@ -504,12 +712,27 @@ def valid_format(data_files: list[str]) -> bool: name_compare = bench_data.get_commit_name(hexsha8_compare) +# Get tool-specific configuration +if tool == "llama-bench": + key_properties = LLAMA_BENCH_KEY_PROPERTIES + bool_properties = LLAMA_BENCH_BOOL_PROPERTIES + pretty_names = LLAMA_BENCH_PRETTY_NAMES + default_show = DEFAULT_SHOW_LLAMA_BENCH + default_hide = DEFAULT_HIDE_LLAMA_BENCH +else: # test-backend-ops + key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES + bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES + pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES + default_show = DEFAULT_SHOW_TEST_BACKEND_OPS + default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS + # If the user provided columns to group the results by, use them: if known_args.show is not None: show = known_args.show.split(",") unknown_cols = [] for prop in show: - if prop not in KEY_PROPERTIES[:-3]: # Last three values are n_prompt, n_gen, n_depth. + valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3] # Exclude n_prompt, n_gen, n_depth for llama-bench + if prop not in valid_props: unknown_cols.append(prop) if unknown_cols: logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}") @@ -518,32 +741,50 @@ def valid_format(data_files: list[str]) -> bool: rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare) # Otherwise, select those columns where the values are not all the same: else: - rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare) + rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare) properties_different = [] - for i, kp_i in enumerate(KEY_PROPERTIES): - if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]: - continue - for row_full in rows_full: - if row_full[i] != rows_full[0][i]: - properties_different.append(kp_i) - break + + if tool == "llama-bench": + # For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic + check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]] + for i, kp_i in enumerate(key_properties): + if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]: + continue + for row_full in rows_full: + if row_full[i] != rows_full[0][i]: + properties_different.append(kp_i) + break + else: # test-backend-ops + # For test-backend-ops, check all key properties + for i, kp_i in enumerate(key_properties): + if kp_i in default_show: + continue + for row_full in rows_full: + if row_full[i] != rows_full[0][i]: + properties_different.append(kp_i) + break show = [] - # Show CPU and/or GPU by default even if the hardware for all results is the same: - if rows_full and "n_gpu_layers" not in properties_different: - ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")]) - if ngl != 99 and "cpu_info" not in properties_different: - show.append("cpu_info") + if tool == "llama-bench": + # Show CPU and/or GPU by default even if the hardware for all results is the same: + if rows_full and 
"n_gpu_layers" not in properties_different: + ngl = int(rows_full[0][key_properties.index("n_gpu_layers")]) - show += properties_different + if ngl != 99 and "cpu_info" not in properties_different: + show.append("cpu_info") - index_default = 0 - for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]: - if prop in show: - index_default += 1 - show = show[:index_default] + DEFAULT_SHOW + show[index_default:] - for prop in DEFAULT_HIDE: + show += properties_different + + index_default = 0 + for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]: + if prop in show: + index_default += 1 + show = show[:index_default] + default_show + show[index_default:] + else: # test-backend-ops + show = default_show + properties_different + + for prop in default_hide: try: show.remove(prop) except ValueError: @@ -551,7 +792,7 @@ def valid_format(data_files: list[str]) -> bool: # Add plot_x parameter to parameters to show if it's not already present: if known_args.plot: - for k, v in PRETTY_NAMES.items(): + for k, v in pretty_names.items(): if v == known_args.plot_x and k not in show: show.append(k) break @@ -563,60 +804,115 @@ def valid_format(data_files: list[str]) -> bool: sys.exit(1) table = [] -for row in rows_show: - n_prompt = int(row[-5]) - n_gen = int(row[-4]) - n_depth = int(row[-3]) - if n_prompt != 0 and n_gen == 0: - test_name = f"pp{n_prompt}" - elif n_prompt == 0 and n_gen != 0: - test_name = f"tg{n_gen}" - else: - test_name = f"pp{n_prompt}+tg{n_gen}" - if n_depth != 0: - test_name = f"{test_name}@d{n_depth}" - # Regular columns test name avg t/s values Speedup - # VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV - table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])]) +primary_metric = "FLOPS" # Default to FLOPS for test-backend-ops + +if tool == "llama-bench": + # For llama-bench, create test names and compare avg_ts values + for row in rows_show: + n_prompt = int(row[-5]) + n_gen = int(row[-4]) + n_depth = int(row[-3]) + if n_prompt != 0 and n_gen == 0: + test_name = f"pp{n_prompt}" + elif n_prompt == 0 and n_gen != 0: + test_name = f"tg{n_gen}" + else: + test_name = f"pp{n_prompt}+tg{n_gen}" + if n_depth != 0: + test_name = f"{test_name}@d{n_depth}" + # Regular columns test name avg t/s values Speedup + # VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV + table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])]) +else: # test-backend-ops + # Determine the primary metric by checking rows until we find one with valid data + if rows_show: + primary_metric = "FLOPS" # Default to FLOPS + flops_values = [] + + # Collect all FLOPS values to determine the best unit + for sample_row in rows_show: + baseline_flops = float(sample_row[-4]) + compare_flops = float(sample_row[-3]) + baseline_bandwidth = float(sample_row[-2]) + + if baseline_flops > 0: + flops_values.extend([baseline_flops, compare_flops]) + elif baseline_bandwidth > 0 and not flops_values: + primary_metric = "Bandwidth (GB/s)" + + # If we have FLOPS data, determine the appropriate unit + if flops_values: + primary_metric = get_flops_unit_name(flops_values) + + # For test-backend-ops, prioritize FLOPS > bandwidth for comparison + for row in rows_show: + # Extract metrics: flops, bandwidth_gb_s (baseline and compare) + baseline_flops = float(row[-4]) + compare_flops = float(row[-3]) + baseline_bandwidth = float(row[-2]) + compare_bandwidth = float(row[-1]) + + # Determine which metric to use for comparison (prioritize FLOPS > bandwidth) + if 
baseline_flops > 0 and compare_flops > 0: + # Use FLOPS comparison (higher is better) + speedup = compare_flops / baseline_flops + baseline_str = format_flops_for_table(baseline_flops, primary_metric) + compare_str = format_flops_for_table(compare_flops, primary_metric) + elif baseline_bandwidth > 0 and compare_bandwidth > 0: + # Use bandwidth comparison (higher is better) + speedup = compare_bandwidth / baseline_bandwidth + baseline_str = f"{baseline_bandwidth:.2f}" + compare_str = f"{compare_bandwidth:.2f}" + else: + # Fallback if no valid data is available + baseline_str = "N/A" + compare_str = "N/A" + speedup = 1.0 + + table.append(list(row[:-4]) + [baseline_str, compare_str, speedup]) # Some a-posteriori fixes to make the table contents prettier: -for bool_property in BOOL_PROPERTIES: +for bool_property in bool_properties: if bool_property in show: ip = show.index(bool_property) for row_table in table: row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No" -if "model_type" in show: - ip = show.index("model_type") - for (old, new) in MODEL_SUFFIX_REPLACE.items(): - for row_table in table: - row_table[ip] = row_table[ip].replace(old, new) +if tool == "llama-bench": + if "model_type" in show: + ip = show.index("model_type") + for (old, new) in MODEL_SUFFIX_REPLACE.items(): + for row_table in table: + row_table[ip] = row_table[ip].replace(old, new) -if "model_size" in show: - ip = show.index("model_size") - for row_table in table: - row_table[ip] = float(row_table[ip]) / 1024 ** 3 + if "model_size" in show: + ip = show.index("model_size") + for row_table in table: + row_table[ip] = float(row_table[ip]) / 1024 ** 3 -if "gpu_info" in show: - ip = show.index("gpu_info") - for row_table in table: - for gns in GPU_NAME_STRIP: - row_table[ip] = row_table[ip].replace(gns, "") + if "gpu_info" in show: + ip = show.index("gpu_info") + for row_table in table: + for gns in GPU_NAME_STRIP: + row_table[ip] = row_table[ip].replace(gns, "") - gpu_names = row_table[ip].split(", ") - num_gpus = len(gpu_names) - all_names_the_same = len(set(gpu_names)) == 1 - if len(gpu_names) >= 2 and all_names_the_same: - row_table[ip] = f"{num_gpus}x {gpu_names[0]}" + gpu_names = row_table[ip].split(", ") + num_gpus = len(gpu_names) + all_names_the_same = len(set(gpu_names)) == 1 + if len(gpu_names) >= 2 and all_names_the_same: + row_table[ip] = f"{num_gpus}x {gpu_names[0]}" -headers = [PRETTY_NAMES[p] for p in show] -headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] +headers = [pretty_names.get(p, p) for p in show] +if tool == "llama-bench": + headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] +else: # test-backend-ops + headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"] if known_args.plot: - def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False): + def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"): try: - import matplotlib.pyplot as plt import matplotlib + import matplotlib.pyplot as plt matplotlib.use('Agg') except ImportError as e: logger.error("matplotlib is required for --plot.") @@ -746,8 +1042,14 @@ def make_axes(num_groups, max_cols=2, base_size=(8, 4)): title = ', '.join(title_parts) if title_parts else 
"Performance comparison" + # Determine y-axis label based on tool type + if tool_type == "llama-bench": + y_label = "Tokens per second (t/s)" + else: # test-backend-ops + y_label = metric_name + ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold') - ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold') + ax.set_ylabel(y_label, fontsize=12, fontweight='bold') ax.set_title(title, fontsize=12, fontweight='bold') ax.legend(loc='best', fontsize=10) ax.grid(True, alpha=0.3) @@ -765,7 +1067,7 @@ def make_axes(num_groups, max_cols=2, base_size=(8, 4)): plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() - create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale) + create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric) print(tabulate( # noqa: NP100 table,