Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/winml/modelkit/analyze/core/information_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ def _check_model(self) -> list[Information]:
self._model,
op_runtime_results=self._op_runtime_results,
device=self._device,
ep=self._ep,
)
manager_init_ms = int((time.perf_counter() - manager_init_start) * 1000)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from __future__ import annotations

from .base import ModelValidator
from .batched_const_matmul_validator import BatchedConstMatMulValidator
from .constant_folding_validator import ConstantFoldingValidator
from .dynamic_input_validator import DynamicInputValidator
from .model_validator_manager import ModelValidatorManager
Expand All @@ -21,6 +22,7 @@


__all__ = [
"BatchedConstMatMulValidator",
"ConstantFoldingValidator",
"DynamicInputValidator",
"ModelValidator",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""Validator for batched MatMul with a constant operand on OpenVINO GPU.

OpenVINO GPU's oneDNN gemm cannot select an implementation for a batched
(rank >= 3) MatMul where an operand is a compile-time constant. The identical
gemm with a dynamic operand, and 2D constant gemm, both compile fine. Models
whose batched MatMul weights fold to constants (e.g. transformer disentangled
attention position terms) therefore fail to compile on OpenVINO GPU with:

[GPU] Failed to select implementation for ... type: gemm

This validator detects that structural pattern and recommends the
``untie-constant-batched-matmul`` surgery, which makes the constant operand
runtime-valued so gemm implementation selection succeeds.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from ...models.information import Action, ActionItem, ActionLevel, Information
from ...utils import infer_ihv_from_ep_name
from .base import ModelValidator


if TYPE_CHECKING:
from ...models.onnx_model import ONNXModel
from ...models.runtime_checks import PatternRuntime

logger = logging.getLogger(__name__)

# Surgery capability enabled when the pattern is detected (kebab-case to match
# the capability registry / autoconf normalization).
_SURGERY_FLAG = "untie-constant-batched-matmul"


class BatchedConstMatMulValidator(ModelValidator):
    """Flag MatMul nodes that have exactly one initializer operand of rank >= 3.

    The check is only active when the target device is ``GPU`` and the
    execution provider name resolves to the Intel IHV; in every other
    configuration :meth:`validate` reports nothing.
    """

    def __init__(
        self,
        model: ONNXModel,
        op_runtime_results: list[PatternRuntime] | None = None,
        ep: str | None = None,
        device: str | None = None,
    ) -> None:
        """Store the targeting context next to the base validator state.

        Args:
            model: Model wrapper forwarded to the base validator.
            op_runtime_results: Optional runtime results forwarded to the
                base validator.
            ep: Execution provider name used to infer the IHV.
            device: Target device name; compared case-insensitively
                against ``"GPU"``.
        """
        super().__init__(model, op_runtime_results=op_runtime_results)
        self.ep = ep
        self.device = device

    @property
    def validator_name(self) -> str:
        """Human-readable validator name used in logs and reports."""
        return "BatchedConstMatMulValidator"

    @property
    def pattern_id(self) -> str:
        """Identifier stamped onto the Information this validator emits."""
        return "MODEL/BatchedConstantMatMul"

    def _is_enabled(self) -> bool:
        """Return True only for a GPU device with an Intel-IHV execution provider."""
        device_name = self.device or ""
        if device_name.upper() != "GPU" or not self.ep:
            return False
        try:
            # Imported lazily; any failure here simply disables the check.
            from ...models.ihv_type import IHVType

            vendor = infer_ihv_from_ep_name(self.ep)
            return vendor == IHVType.INTEL
        except Exception:  # pragma: no cover - defensive
            return False

    def _collect_offenders(self) -> list[str]:
        """Return a label for each MatMul with one rank>=3 initializer operand.

        The label is the node name, or the initializer name when the node
        is unnamed.
        """
        # One map serves both the "is it an initializer?" membership test
        # and the rank lookup.
        init_ranks = {tensor.name: len(tensor.dims) for tensor in self.graph.initializer}

        flagged: list[str] = []
        for node in self.graph.node:
            if node.op_type != "MatMul" or len(node.input) != 2:
                continue
            constants = [operand for operand in node.input if operand in init_ranks]
            # MatMuls with two constant operands are skipped: they fold away
            # and never reach gemm implementation selection.
            if len(constants) == 1 and init_ranks[constants[0]] >= 3:
                flagged.append(node.name or constants[0])
        return flagged

    def validate(self) -> Information | None:
        """Report batched constant-operand MatMuls and the surgery that fixes them.

        Returns:
            ``None`` when the validator is disabled for the current EP/device
            or when no offending MatMul exists; otherwise an ``Information``
            carrying one REQUIRED action that turns on the
            ``untie-constant-batched-matmul`` graph optimization.
        """
        if not self._is_enabled():
            return None

        flagged = self._collect_offenders()
        if not flagged:
            return None

        # Only the first three offenders are named in the explanation.
        sample_names = ", ".join(flagged[:3])
        explanation = (
            f"Model contains {len(flagged)} batched MatMul(s) with a constant "
            f"operand (examples: {sample_names}). OpenVINO GPU's oneDNN gemm cannot "
            "select an implementation for a batched MatMul with a constant "
            "operand, causing a '[GPU] Failed to select implementation ... gemm' "
            "compile failure. The untie-constant-batched-matmul surgery makes "
            "the operand runtime-valued without changing numerics."
        )
        enable_surgery = ActionItem(
            type="GraphOptimization",
            optimization_options={_SURGERY_FLAG: True},
        )
        action = Action(
            pattern_from_id="",
            pattern_to_id="",
            level=ActionLevel.REQUIRED,
            status=None,
            action_items=[enable_surgery],
            details=(
                "Enable untie-constant-batched-matmul surgery so the constant "
                "operand becomes runtime-valued and OpenVINO GPU can select a "
                "gemm implementation."
            ),
        )
        return Information(
            explanation=explanation,
            actions=[action],
            pattern_id=self.pattern_id,
            status=None,
        )
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from typing import TYPE_CHECKING, ClassVar

from ...utils.timing_utils import make_timing_logger
from .batched_const_matmul_validator import BatchedConstMatMulValidator
from .constant_folding_validator import ConstantFoldingValidator
from .dynamic_input_validator import DynamicInputValidator
from .pattern_matching_validator import PatternMatchingValidator
Expand Down Expand Up @@ -64,6 +65,11 @@ class ModelValidatorManager:
"class": PatternMatchingValidator,
"enabled_devices": None, # All devices
},
"batched_const_matmul": {
"class": BatchedConstMatMulValidator,
"enabled_devices": ["GPU"], # OpenVINO GPU gemm impl-selection issue
"needs_context": True, # validator self-gates on EP (Intel IHV)
},
}

def __init__(
Expand All @@ -72,6 +78,7 @@ def __init__(
enabled_validators: list[str] | None = None,
op_runtime_results: list[PatternRuntime] | None = None,
device: str | None = None,
ep: str | None = None,
) -> None:
"""Initialize validator manager.

Expand All @@ -92,6 +99,7 @@ def __init__(
self.model_proto = model.get_model()
self.op_runtime_results = op_runtime_results or []
self.device = device or "NPU"
self.ep = ep
self.enabled_validators = enabled_validators or list(self.VALIDATORS.keys())

# Instantiate enabled validators
Expand All @@ -102,18 +110,24 @@ def __init__(
validator_class = validator_config["class"]
enabled_devices = validator_config.get("enabled_devices")

# Check device constraint
if enabled_devices is not None and self.device not in enabled_devices:
# Check device constraint (case-insensitive: callers may pass
# "gpu" or "GPU" depending on the build/analyze entry point).
if enabled_devices is not None and (self.device or "").upper() not in {
d.upper() for d in enabled_devices
}:
logger.info(
f"Validator '{name}' is not enabled for device '{self.device}'. "
f"Only enabled for: {enabled_devices}"
)
continue

ctor_kwargs: dict = {"op_runtime_results": self.op_runtime_results}
if validator_config.get("needs_context"):
ctor_kwargs["ep"] = self.ep
ctor_kwargs["device"] = self.device

try:
self.validators.append(
validator_class(self.model, op_runtime_results=self.op_runtime_results)
)
self.validators.append(validator_class(self.model, **ctor_kwargs))
logger.debug(f"Initialized validator: {name}")
except Exception:
logger.exception(f"Failed to initialize validator {name}")
Expand Down
17 changes: 17 additions & 0 deletions src/winml/modelkit/optim/capabilities/surgery.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,20 @@
category=CapabilityCategory.SURGERY,
default=False,
)

# Surgery: feed the constant operand of a batched (rank >= 3) MatMul through a
# runtime no-op so that it stops being a compile-time constant.
#
# Why: OpenVINO GPU's oneDNN gemm cannot select an implementation for a
# batched MatMul with a constant operand (e.g. transformer
# disentangled-attention position terms that fold to 3D constants). Once the
# operand is runtime-valued, gemm impl selection succeeds without changing
# numerics or splitting the batched op.
UNTIE_CONSTANT_BATCHED_MATMUL = BoolCapability(
    name="untie-constant-batched-matmul",
    # No ORT optimizer counterpart: this is a custom implementation.
    ort_name=None,
    description="Make a batched MatMul's constant operand runtime-valued so OpenVINO GPU can select a gemm implementation",
    category=CapabilityCategory.SURGERY,
    # Opt-in only.
    default=False,
)
Loading
Loading