diff --git a/.gitignore b/.gitignore
index f8ceb1560a1df..ba0da8655bf15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,6 +137,9 @@ poetry.toml
 /tests/test-tokenizer-1-bpe
 /tests/test-tokenizer-1-spm
 
+# Test reports
+comparison_backend_ops_perf.txt
+
 # Scripts
 !/scripts/install-oneapi.bat
 
diff --git a/scripts/compare-commits-op-perf.sh b/scripts/compare-commits-op-perf.sh
new file mode 100755
index 0000000000000..e2c04941cfa22
--- /dev/null
+++ b/scripts/compare-commits-op-perf.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+if [ $# -lt 2 ]; then
+    echo "usage: ./scripts/compare-commits-op-perf.sh <commit1> <commit2> [additional test-backend-ops arguments]"
+    exit 1
+fi
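+
+# example (refs are placeholders): GGML_CUDA=1 ./scripts/compare-commits-op-perf.sh master my-branch -o MUL_MAT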
+
+set -e
+set -x
+
+test_backend_ops_args=("${@:3}")
+
+# Extract short form of commits (first 7 characters)
+commit1_short=${1:0:7}
+commit2_short=${2:0:7}
+
+rm -f test-backend-ops-perf-*.log
+
+# to test a backend, call the script with the corresponding environment variable set (e.g. GGML_CUDA=1 ./scripts/compare-commits-op-perf.sh ...)
+if [ -n "$GGML_CUDA" ]; then
+    CMAKE_OPTS="${CMAKE_OPTS} -DGGML_CUDA=ON"
+fi
+
+dir="build-test-backend-ops"
+
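+# build the currently checked-out commit and benchmark it; the log is written to test-backend-ops-perf-<commit_short>.log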
+function run {
+    commit_short=$1
+    rm -fr ${dir} > /dev/null
+    cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
+    cmake --build ${dir} -t test-backend-ops > /dev/null
+    ${dir}/bin/test-backend-ops "${test_backend_ops_args[@]}" perf 2>&1 | tee test-backend-ops-perf-${commit_short}.log
+}
+
+git checkout "$1" > /dev/null
+run $commit1_short
+
+git checkout "$2" > /dev/null
+run $commit2_short
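+
+# generate the side-by-side report (written to comparison_backend_ops_perf.txt by default)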
+
+./scripts/compare-test-backend-ops-perf.py -b test-backend-ops-perf-$commit1_short.log -c test-backend-ops-perf-$commit2_short.log
diff --git a/scripts/compare-test-backend-ops-perf.py b/scripts/compare-test-backend-ops-perf.py
new file mode 100755
index 0000000000000..ab6d0832c2a16
--- /dev/null
+++ b/scripts/compare-test-backend-ops-perf.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
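+#
+# Compare two test-backend-ops perf logs (baseline vs. compare) and write a
+# plain-text comparison report; the logs are produced by scripts/compare-commits-op-perf.sh.
+# usage: ./scripts/compare-test-backend-ops-perf.py -b baseline.log -c compare.log [-o report.txt]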
+
+import argparse
+import logging
+import re
+import sys
+from pathlib import Path
+from typing import Tuple, Union
+
+# Set up logging
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+
+def parse_benchmark_line(
+    line: str,
+) -> Tuple[Union[str, None], Union[float, None], Union[str, None]]:
+    """
+    Parses a single line of benchmark output.
+
+    Example lines:
+    MUL_MAT(...): 744 runs - 1660.11 us/run - 134.48 MFLOP/run - 81.01 GFLOPS
+    ADD(...): 98280 runs - 10.87 us/run - 48 kB/run - 4.21 GB/s
+
+    Returns a tuple of (key, normalized_value, unit_type) or (None, None, None) if parsing fails.
+
+    Performance units:
+    - GFLOPS/TFLOPS/MFLOPS: Floating Point Operations Per Second (normalized to GFLOPS)
+    - GB/s/MB/s/TB/s: Bytes Per Second (normalized to GB/s)
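+
+    Examples:
+    >>> parse_benchmark_line("MUL_MAT(...): 744 runs - 1660.11 us/run - 134.48 MFLOP/run - 81.01 GFLOPS")
+    ('MUL_MAT(...)', 81.01, 'GFLOPS')
+    >>> parse_benchmark_line("ADD(...): 98280 runs - 10.87 us/run - 48 kB/run - 4.21 GB/s")
+    ('ADD(...)', 4.21, 'GB/s')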
+    """
+    line = line.strip()
+    if ":" not in line:
+        return None, None, None
+
+    key, data_part = line.split(":", 1)
+    key = key.strip()
+
+    # Remove ANSI color codes from the data part
+    data_part = re.sub(r"\x1b\[[0-9;]*m", "", data_part)
+
+    # Try to match FLOPS units first
+    flops_match = re.search(
+        r"([\d\.]+)\s+(GFLOPS|TFLOPS|MFLOPS)\s*$", data_part.strip()
+    )
+    if flops_match:
+        value_str, unit = flops_match.groups()
+        value = float(value_str)
+
+        # Normalize everything to GFLOPS
+        if unit == "TFLOPS":
+            normalized_value = value * 1000
+        elif unit == "MFLOPS":
+            normalized_value = value / 1000
+        elif unit == "GFLOPS":
+            normalized_value = value
+        else:
+            assert False
+
+        return key, normalized_value, "GFLOPS"
+
+    # Try to match bandwidth units (GB/s, MB/s, TB/s)
+    bandwidth_match = re.search(r"([\d\.]+)\s+(GB/s|MB/s|TB/s)\s*$", data_part.strip())
+    if bandwidth_match:
+        value_str, unit = bandwidth_match.groups()
+        value = float(value_str)
+
+        # Normalize everything to GB/s
+        if unit == "TB/s":
+            normalized_value = value * 1000
+        elif unit == "MB/s":
+            normalized_value = value / 1000
+        elif unit == "GB/s":
+            normalized_value = value
+        else:
+            assert False
+
+        return key, normalized_value, "GB/s"
+
+    return None, None, None
+
+
+def extract_commit_id(filepath: Path) -> str:
+    """Extract commit ID from filename like test-backend-ops-perf-abc1234.log"""
+    filename = filepath.name
+    # Pattern: test-backend-ops-perf-<commit_id>.log
+    match = re.match(r"test-backend-ops-perf-([^.]+)\.log", filename)
+    if match:
+        return match.group(1)
+    return ""
+
+
+def load_results(filepath: Path) -> dict:
+    """Loads all benchmark results from a file into a dictionary."""
+    results = {}
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            for line in f:
+                key, value, unit_type = parse_benchmark_line(line)
+                if key and value is not None and unit_type:
+                    results[key] = {"value": value, "unit": unit_type}
+    except FileNotFoundError:
+        logger.error(f"Error: File not found at {filepath}")
+        sys.exit(1)
+    return results
+
+
+def format_change(change: float) -> str:
+    """Formats the percentage change."""
+    if change > 0.1:
+        return f"+{change:.2f}%"
+    elif change < -0.1:
+        return f"{change:.2f}%"
+    else:
+        return " ~0.00%"
+
+
+def main():
+    """Main function to compare benchmark files."""
+    parser = argparse.ArgumentParser(
+        description="Compare two benchmark result files and generate a report.",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    help_b = "Path to the baseline benchmark results file."
+    parser.add_argument(
+        "-b", "--baseline", dest="baseline", type=Path, required=True, help=help_b
+    )
+    help_c = "Path to the benchmark results file to compare against the baseline."
+    parser.add_argument(
+        "-c", "--compare", dest="compare", type=Path, required=True, help=help_c
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default="comparison_backend_ops_perf.txt",
+        help="Path to the output report file (default: comparison_backend_ops_perf.txt).",
+    )
+    args = parser.parse_args()
+
+    logger.info(f"Loading baseline results from: {args.baseline}")
+    baseline_results = load_results(args.baseline)
+    logger.info(f"Loading compare results from: {args.compare}")
+    compare_results = load_results(args.compare)
+
+    if not baseline_results or not compare_results:
+        logger.error("Could not load results from one or both files. Exiting.")
+        sys.exit(1)
+
+    # Extract commit IDs from filenames
+    baseline_commit = extract_commit_id(args.baseline)
+    compare_commit = extract_commit_id(args.compare)
+
+    all_keys = sorted(set(baseline_results) | set(compare_results))
+
+    # Determine the unit type from the first available result
+    # Assume all data will be of the same unit type (either all GFLOPS or all GB/s)
+    unit_type = "GFLOPS"  # default
+    for key in all_keys:
+        baseline_data = baseline_results.get(key)
+        compare_data = compare_results.get(key)
+        if baseline_data:
+            unit_type = baseline_data["unit"]
+            break
+        elif compare_data:
+            unit_type = compare_data["unit"]
+            break
+
+    comparisons = []
+
+    for key in all_keys:
+        baseline_data = baseline_results.get(key)
+        compare_data = compare_results.get(key)
+
+        # Extract values
+        baseline_val = baseline_data["value"] if baseline_data else None
+        compare_val = compare_data["value"] if compare_data else None
+
+        # Calculate change if both values exist
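+        # e.g. baseline 100.0 vs. compare 110.0 gives a +10.00% change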
+        change = 0
+        if baseline_val is not None and compare_val is not None:
+            change = ((compare_val - baseline_val) / baseline_val) * 100
+
+        entry = {
+            "key": key,
+            "baseline": baseline_val,
+            "compare": compare_val,
+            "change": change,
+        }
+
+        comparisons.append(entry)
+
+    # --- Generate Report ---
+    with open(args.output, "w", encoding="utf-8") as f:
+
+        # Create header with the determined unit type
+        baseline_header = f"Baseline {unit_type}"
+        compare_header = f"Compare {unit_type}"
+
+        if baseline_commit:
+            baseline_header = f"Baseline ({baseline_commit}) {unit_type}"
+        if compare_commit:
+            compare_header = f"Compare ({compare_commit}) {unit_type}"
+
+        key_width = max(len(k) for k in all_keys) + 2
+        header = f"{'Test Configuration':<{key_width}} {baseline_header:>25} {compare_header:>25} {'Change (%)':>15}"
+        f.write(header + "\n")
+        f.write("-" * len(header) + "\n")
+
+        for item in comparisons:
+            baseline_str = (
+                f"{item['baseline']:.2f}" if item["baseline"] is not None else "N/A"
+            )
+            compare_str = (
+                f"{item['compare']:.2f}" if item["compare"] is not None else "N/A"
+            )
+            if item["baseline"] is not None and item["compare"] is not None:
+                change_str = format_change(item["change"])
+            else:
+                # a test present in only one log has no meaningful change to report
+                change_str = "N/A"
+            f.write(
+                f"{item['key']:<{key_width}} {baseline_str:>25} {compare_str:>25} {change_str:>15}\n"
+            )
+
+    logger.info(f"Comparison report successfully generated at: {args.output}")
+
+
+if __name__ == "__main__":
+    main()