Skip to content
Merged
22 changes: 22 additions & 0 deletions scripts/e2e_eval/testsets/models_all.json
Original file line number Diff line number Diff line change
Expand Up @@ -4839,6 +4839,28 @@
"optimum_supported": false,
"order": 6
},
{
"hf_id": "timm/mobilenetv3_small_100.lamb_in1k",
"task": "image-classification",
"model_type": "timm_wrapper",
"group": "ISV",
"priority": "P1",
"downloads": 0,
"last_update_time": "2024-01-01T00:00:00+00:00",
"optimum_supported": true,
"order": 1
},
{
"hf_id": "timm/repghostnet_200.in1k",
"task": "image-classification",
"model_type": "timm_wrapper",
"group": "ISV",
"priority": "P1",
"downloads": 0,
"last_update_time": "2024-01-01T00:00:00+00:00",
"optimum_supported": true,
"order": 2
},
{
"hf_id": "timpal0l/mdeberta-v3-base-squad2",
"task": "question-answering",
Expand Down
111 changes: 87 additions & 24 deletions src/winml/modelkit/export/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,49 +206,112 @@ def _get_onnx_config(
def _height_width_from_size(size: object) -> tuple[int, int] | None:
    """Return ``(height, width)`` parsed from a preprocessor ``size`` value.

    Recognized forms are a bare int (square image), a dict carrying both
    ``height`` and ``width``, and a dict carrying ``shortest_edge`` (treated
    as square). Any other value yields ``None``.
    """
    if isinstance(size, int):
        return size, size
    if isinstance(size, dict):
        # Require both keys: a dict with only "height" must not raise
        # KeyError or produce a half-populated result.
        if "height" in size and "width" in size:
            return size["height"], size["width"]
        if "shortest_edge" in size:
            edge = size["shortest_edge"]
            return edge, edge
    return None


def _populate_image_size_from_preprocessor(
    model_id: str | None,
    shape_kwargs: dict,
    hf_config: PretrainedConfig | None = None,
) -> None:
    """Populate height/width in shape_kwargs from preprocessor metadata.

    Optimum's DummyVisionInputGenerator falls back to 64x64 when model config
    lacks image_size (e.g., ResNet, timm). This reads the size from the
    preprocessor_config-style dict returned by ``_get_preprocessor_dict`` and
    injects it into ``shape_kwargs``.

    ``height`` and ``width`` are always written together; when the ``size``
    entry is missing or in an unrecognized form, ``shape_kwargs`` is left
    untouched.

    Args:
        model_id: HuggingFace model identifier (e.g., "microsoft/resnet-50").
            When falsy, the function returns without doing anything.
        shape_kwargs: Mutable dict to update with height/width if found.
            If it already has ``height`` or ``width``, it is not modified.
        hf_config: HuggingFace PretrainedConfig forwarded to
            ``_get_preprocessor_dict`` as a fallback source of size metadata.
    """
    if not model_id:
        return

    # Caller-supplied dimensions take precedence over preprocessor metadata.
    if "height" in shape_kwargs or "width" in shape_kwargs:
        return

    size = _get_preprocessor_dict(model_id, hf_config).get("size")
    dims = _height_width_from_size(size)
    if dims is None:
        return

    shape_kwargs["height"], shape_kwargs["width"] = dims
    logger.debug(
        "Loaded image size from preprocessor dict: %dx%d",
        shape_kwargs["height"],
        shape_kwargs["width"],
    )


def _get_preprocessor_dict(
    model_id: str | None,
    hf_config: PretrainedConfig | None,
) -> dict:
    """Return a ``preprocessor_config.json``-style dict for the model.

    Resolution order:

    1. The dict returned by
       ``ImageProcessingMixin.get_image_processor_dict(model_id)``, used only
       when it carries a non-null ``size`` entry.
    2. The result of ``_synthesize_preprocessor_dict(hf_config)``, reached
       when step 1 raised ``OSError``/``ValueError``/``KeyError`` or returned
       a config without a usable ``size``, and ``hf_config`` is not ``None``.

    Args:
        model_id: HuggingFace model identifier passed to the image-processor
            loader.
        hf_config: Config object used for the synthesis fallback; may be
            ``None``, in which case no fallback is attempted.

    Returns:
        The loaded or synthesized dict, or an empty dict when neither source
        is available.
    """
    try:
        from transformers.image_processing_utils import ImageProcessingMixin

        config, _ = ImageProcessingMixin.get_image_processor_dict(model_id)
    except (OSError, ValueError, KeyError) as e:
        # if model_id is None, OSError is raised
        logger.debug("Could not load preprocessor_config.json for %s: %s", model_id, e)
    else:
        # A present-but-null "size" is as unusable as a missing one, so only
        # return the hub config when the entry actually holds a value.
        if config.get("size") is not None:
            return config
        # Partial preprocessor_config.json without a usable "size": fall
        # through to synthesis so we don't silently use Optimum's 64x64
        # default.

    if hf_config is not None:
        return _synthesize_preprocessor_dict(hf_config)
    return {}


def _synthesize_preprocessor_dict(hf_config: PretrainedConfig) -> dict:
    """Build a ``preprocessor_config.json``-style dict from ``hf_config.pretrained_cfg``.

    timm wrapper configs (``TimmWrapperConfig``) stash shape metadata in a
    ``pretrained_cfg`` dict carrying ``input_size = [C, H, W]``. Optimum's
    NormalizedConfig only walks ``PretrainedConfig`` children, so this
    dict-wrapped value is invisible to the dummy-input generator and it
    falls back to 64x64.

    Preprocessing keys (``mean``/``std``/``interpolation``/``crop_pct``)
    don't affect export tensor shapes and are intentionally ignored.

    Args:
        hf_config: Config object whose ``pretrained_cfg`` attribute, if it is
            a dict, is inspected for ``input_size``.

    Returns:
        ``{"size": {"height": H, "width": W}}`` for a 3-element
        ``input_size``, ``{"size": N}`` for a 1-element ``input_size``, and
        an empty dict when ``pretrained_cfg`` is absent or not a dict, when
        ``input_size`` is not a list/tuple of length 1 or 3, or when the
        dimensions are not positive integers.
    """
    pretrained_cfg = getattr(hf_config, "pretrained_cfg", None)
    if not isinstance(pretrained_cfg, dict):
        return {}

    input_size = pretrained_cfg.get("input_size")
    if not isinstance(input_size, (list, tuple)):
        return {}

    def _is_dim(value: object) -> bool:
        # bool is a subclass of int, so exclude it explicitly; zero and
        # negative values cannot be tensor dimensions.
        return isinstance(value, int) and not isinstance(value, bool) and value > 0

    if len(input_size) == 3:
        # Index 0 is skipped: only the last two entries are used as H and W.
        _, height, width = input_size
        if _is_dim(height) and _is_dim(width):
            return {"size": {"height": height, "width": width}}
    elif len(input_size) == 1 and _is_dim(input_size[0]):
        return {"size": input_size[0]}

    return {}


# Practical cap for export dummy input sequence length.
# LLMs have max_position_embeddings of 40K-131K which would OOM during export.
Expand Down Expand Up @@ -339,7 +402,7 @@ def generate_dummy_inputs(
onnx_config.float_dtype = float_dtype

shape_kwargs["batch_size"] = batch_size
_populate_image_size_from_preprocessor(model_id, shape_kwargs)
_populate_image_size_from_preprocessor(model_id, shape_kwargs, hf_config)
_populate_sequence_length_from_config(hf_config, shape_kwargs)

logger.debug(
Expand Down Expand Up @@ -402,7 +465,7 @@ def resolve_io_specs(

# Populate shapes from model config / preprocessor
shape_kwargs["batch_size"] = batch_size
_populate_image_size_from_preprocessor(model_id, shape_kwargs)
_populate_image_size_from_preprocessor(model_id, shape_kwargs, hf_config)
_populate_sequence_length_from_config(hf_config, shape_kwargs)

# Generate dummy inputs for concrete shapes and dtypes,
Expand Down
8 changes: 5 additions & 3 deletions src/winml/modelkit/inspect/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,14 +812,16 @@ def get_config_attr(
if val is not None:
extra[attr] = val

# Step 5: Fallback — read image_size from preprocessor_config.json
# for models like ResNet where HF config lacks image_size
# Step 5: Fallback — read image_size from a preprocessor-style dict
# (preprocessor_config.json on the hub, or synthesized from a nested
# dict on hf_config such as TimmWrapperConfig.pretrained_cfg) when the
# top-level HF config lacks image_size.
if image_size is None and model_id is not None:
try:
from ..export.io import _populate_image_size_from_preprocessor

shape_kwargs: dict = {}
_populate_image_size_from_preprocessor(model_id, shape_kwargs)
_populate_image_size_from_preprocessor(model_id, shape_kwargs, config)
if "height" in shape_kwargs:
h, w = shape_kwargs["height"], shape_kwargs["width"]
image_size = h if h == w else (h, w)
Expand Down
Loading
Loading