microsoft · xieofxie · Jun 5, 2026
@@ -297,6 +297,7 @@ def _check_model(self) -> list[Information]:
                 self._model,
                 op_runtime_results=self._op_runtime_results,
                 device=self._device,
+                ep=self._ep,
             )
             manager_init_ms = int((time.perf_counter() - manager_init_start) * 1000)
 

@@ -12,6 +12,7 @@
 from __future__ import annotations
 
 from .base import ModelValidator
+from .batched_const_matmul_validator import BatchedConstMatMulValidator
 from .constant_folding_validator import ConstantFoldingValidator
 from .dynamic_input_validator import DynamicInputValidator
 from .model_validator_manager import ModelValidatorManager
@@ -21,6 +22,7 @@
 
 
 __all__ = [
+    "BatchedConstMatMulValidator",
     "ConstantFoldingValidator",
     "DynamicInputValidator",
     "ModelValidator",

@@ -0,0 +1,129 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Validator for batched MatMul with a constant operand on OpenVINO GPU.
+
+OpenVINO GPU's oneDNN gemm cannot select an implementation for a batched
+(rank >= 3) MatMul where an operand is a compile-time constant. The identical
+gemm with a dynamic operand, and 2D constant gemm, both compile fine. Models
+whose batched MatMul weights fold to constants (e.g. transformer disentangled
+attention position terms) therefore fail to compile on OpenVINO GPU with:
+
+    [GPU] Failed to select implementation for ... type: gemm
+
+This validator detects that structural pattern and recommends the
+``untie-constant-batched-matmul`` surgery, which makes the constant operand
+runtime-valued so gemm implementation selection succeeds.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+from ...models.information import Action, ActionItem, ActionLevel, Information
+from ...utils import infer_ihv_from_ep_name
+from .base import ModelValidator
+
+
+if TYPE_CHECKING:
+    from ...models.onnx_model import ONNXModel
+    from ...models.runtime_checks import PatternRuntime
+
+logger = logging.getLogger(__name__)
+
+# Surgery capability enabled when the pattern is detected (kebab-case to match
+# the capability registry / autoconf normalization).
+_SURGERY_FLAG = "untie-constant-batched-matmul"
+
+
+class BatchedConstMatMulValidator(ModelValidator):
+    """Detect MatMul nodes fed by one rank >= 3 initializer (OpenVINO GPU only)."""
+
+    def __init__(
+        self,
+        model: ONNXModel,
+        op_runtime_results: list[PatternRuntime] | None = None,
+        ep: str | None = None,
+        device: str | None = None,
+    ) -> None:
+        super().__init__(model, op_runtime_results=op_runtime_results)
+        self.ep = ep
+        self.device = device
+
+    @property
+    def validator_name(self) -> str:
+        """Name of this validator for logging/reporting."""
+        return "BatchedConstMatMulValidator"
+
+    @property
+    def pattern_id(self) -> str:
+        """Pattern ID for Information objects."""
+        return "MODEL/BatchedConstantMatMul"
+
+    def _is_enabled(self) -> bool:
+        """Return True only for a GPU device whose EP maps to the Intel IHV."""
+        if (self.device or "").upper() != "GPU" or not self.ep:
+            return False
+        try:
+            from ...models.ihv_type import IHVType
+
+            return infer_ihv_from_ep_name(self.ep) == IHVType.INTEL
+        except Exception:  # pragma: no cover - defensive
+            # Best-effort gate: never break validation, but leave a trace.
+            logger.debug("IHV inference failed for EP %r; skipping", self.ep, exc_info=True)
+            return False
+
+    def validate(self) -> Information | None:
+        """Detect MatMul nodes with exactly one initializer operand of rank >= 3.
+
+        Returns None if disabled or nothing matches, else an Information with a REQUIRED action.
+        """
+        if not self._is_enabled():
+            return None
+
+        # Initializer name -> rank; the key set doubles as the "is constant" test.
+        rank_by_init = {init.name: len(init.dims) for init in self.graph.initializer}
+
+        offenders: list[str] = []
+        for node in self.graph.node:
+            if node.op_type != "MatMul" or len(node.input) != 2:
+                continue
+            const_inputs = [name for name in node.input if name in rank_by_init]
+            # Exactly one constant operand (two-constant MatMuls fold away and
+            # never reach gemm impl selection).
+            if len(const_inputs) == 1 and rank_by_init[const_inputs[0]] >= 3:
+                # Unnamed nodes are reported by their constant operand's name.
+                offenders.append(node.name or const_inputs[0])
+
+        if not offenders:
+            return None
+
+        # Quote at most three offenders to keep the explanation short.
+        examples = ", ".join(offenders[:3])
+        action = Action(
+            pattern_from_id="",
+            pattern_to_id="",
+            level=ActionLevel.REQUIRED,
+            status=None,
+            action_items=[
+                ActionItem(type="GraphOptimization", optimization_options={_SURGERY_FLAG: True})
+            ],
+            details=(
+                "Enable untie-constant-batched-matmul surgery so the constant "
+                "operand becomes runtime-valued and OpenVINO GPU can select a "
+                "gemm implementation."
+            ),
+        )
+        explanation = (
+            f"Model contains {len(offenders)} batched MatMul(s) with a constant "
+            f"operand (examples: {examples}). OpenVINO GPU's oneDNN gemm cannot "
+            f"select an implementation for a batched MatMul with a constant "
+            f"operand, causing a '[GPU] Failed to select implementation ... gemm' "
+            f"compile failure. The untie-constant-batched-matmul surgery makes "
+            f"the operand runtime-valued without changing numerics."
+        )
+        return Information(
+            explanation=explanation, actions=[action], pattern_id=self.pattern_id, status=None
+        )
@@ -15,6 +15,7 @@
 from typing import TYPE_CHECKING, ClassVar
 
 from ...utils.timing_utils import make_timing_logger
+from .batched_const_matmul_validator import BatchedConstMatMulValidator
 from .constant_folding_validator import ConstantFoldingValidator
 from .dynamic_input_validator import DynamicInputValidator
 from .pattern_matching_validator import PatternMatchingValidator
@@ -64,6 +65,11 @@ class ModelValidatorManager:
             "class": PatternMatchingValidator,
             "enabled_devices": None,  # All devices
         },
+        "batched_const_matmul": {
+            "class": BatchedConstMatMulValidator,
+            "enabled_devices": ["GPU"],  # OpenVINO GPU gemm impl-selection issue
+            "needs_context": True,  # validator self-gates on EP (Intel IHV)
+        },
     }
 
     def __init__(
@@ -72,6 +78,7 @@ def __init__(
         enabled_validators: list[str] | None = None,
         op_runtime_results: list[PatternRuntime] | None = None,
         device: str | None = None,
+        ep: str | None = None,
     ) -> None:
         """Initialize validator manager.
 
@@ -92,6 +99,7 @@ def __init__(
         self.model_proto = model.get_model()
         self.op_runtime_results = op_runtime_results or []
         self.device = device or "NPU"
+        self.ep = ep
         self.enabled_validators = enabled_validators or list(self.VALIDATORS.keys())
 
         # Instantiate enabled validators
@@ -102,18 +110,24 @@ def __init__(
                 validator_class = validator_config["class"]
                 enabled_devices = validator_config.get("enabled_devices")
 
-                # Check device constraint
-                if enabled_devices is not None and self.device not in enabled_devices:
+                # Check device constraint (case-insensitive: callers may pass
+                # "gpu" or "GPU" depending on the build/analyze entry point).
+                if enabled_devices is not None and (self.device or "").upper() not in {
+                    d.upper() for d in enabled_devices
+                }:
                     logger.info(
                         f"Validator '{name}' is not enabled for device '{self.device}'. "
                         f"Only enabled for: {enabled_devices}"
                     )
                     continue
 
+                ctor_kwargs: dict = {"op_runtime_results": self.op_runtime_results}
+                if validator_config.get("needs_context"):
+                    ctor_kwargs["ep"] = self.ep
+                    ctor_kwargs["device"] = self.device
+
                 try:
-                    self.validators.append(
-                        validator_class(self.model, op_runtime_results=self.op_runtime_results)
-                    )
+                    self.validators.append(validator_class(self.model, **ctor_kwargs))
                     logger.debug(f"Initialized validator: {name}")
                 except Exception:
                     logger.exception(f"Failed to initialize validator {name}")

@@ -37,3 +37,20 @@
     category=CapabilityCategory.SURGERY,
     default=False,
 )
+
+# Opt-in surgery (default off). Routes a constant operand of a batched
+# (rank >= 3) MatMul through a runtime no-op so it is no longer a compile-time
+# constant. OpenVINO GPU's oneDNN gemm cannot select an implementation for a
+# batched MatMul with a constant operand (e.g. transformer disentangled-attention
+# position terms that fold to 3D constants); making the operand runtime-valued
+# lets gemm impl selection succeed without changing numerics or splitting the op.
+UNTIE_CONSTANT_BATCHED_MATMUL = BoolCapability(
+    name="untie-constant-batched-matmul",
+    ort_name=None,  # Custom implementation, not ORT optimizer
+    description=(
+        "Make a batched MatMul's constant operand runtime-valued so OpenVINO "
+        "GPU can select a gemm implementation"
+    ),
+    category=CapabilityCategory.SURGERY,
+    default=False,
+)