From e72eb9387de60087f0e3602232f59c81554aea3e Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Tue, 24 Mar 2026 00:04:39 +0530 Subject: [PATCH 1/5] feat: add vision encoder spec for multimodal models (issue #150) Adds a vision encoder specification to describe how multimodal models process image and video inputs. The current spec supports declaring image modality via capabilities.inputTypes but has no architectural description of the vision encoder. New files: - schema/vision-encoder-schema.json: JSON Schema for vision encoder fields (encoder type, patch size, projector, fusion type, special tokens, dynamic resolution, mRoPE) - docs/vision-encoder.md: Spec document with field descriptions and model coverage matrix (LLaVA, Qwen2-VL, LLaMA-3.2 Vision, Gemma 2 VL) - tools/hf_parser.py: Extended HF config parser with vision model support (parse_vision_config function) - tools/hf_parser_test.py: 50 tests (26 decoder + 24 vision) covering LLaVA 1.5, Qwen2-VL, LLaMA-3.2 Vision, and text-only models Relates to #150 Signed-off-by: pradhyum6144 --- docs/vision-encoder.md | 210 ++++++++++++ schema/vision-encoder-schema.json | 167 +++++++++ tools/hf_parser.py | 547 ++++++++++++++++++++++++++++++ tools/hf_parser_test.py | 517 ++++++++++++++++++++++++++++ 4 files changed, 1441 insertions(+) create mode 100644 docs/vision-encoder.md create mode 100644 schema/vision-encoder-schema.json create mode 100644 tools/hf_parser.py create mode 100644 tools/hf_parser_test.py diff --git a/docs/vision-encoder.md b/docs/vision-encoder.md new file mode 100644 index 0000000..c2eee0d --- /dev/null +++ b/docs/vision-encoder.md @@ -0,0 +1,210 @@ +# Vision Encoder Specification + +This document describes the vision encoder architecture fields for multimodal models that process image and video inputs. It extends the model configuration defined in [config.md](./config.md) to cover the architectural details of how visual inputs are processed. 
+ +## Background + +The current ModelPack specification supports declaring image modality via `capabilities.inputTypes: ["image"]`, but provides no architectural description of how images are processed. Every major model family now has a vision variant (LLaVA, Qwen2-VL, LLaMA-3.2 Vision, Gemma 2 VL), and inference engines need structured metadata about the vision encoder to correctly configure image preprocessing, patch embedding, and vision-language fusion. + +## Architecture Overview + +Vision-language models follow a common pattern: + +``` +Input Image → Vision Encoder → Projector → Language Model → Text Output + ↓ + Visual token embeddings +``` + +The **vision encoder** converts raw images into a sequence of visual tokens using a Vision Transformer (ViT) or CLIP-ViT architecture. A **projector** module maps these visual tokens into the language model's embedding space. The **fusion type** determines how visual and textual tokens interact inside the language model. + +## Properties + +- **type** _string_, REQUIRED + + The vision encoder architecture type. Supported values: + + | Value | Description | + |-------|-------------| + | `"vit"` | Standard Vision Transformer | + | `"clip_vit"` | CLIP-pretrained Vision Transformer | + | `"other"` | Other vision encoder architecture | + +- **hidden_size** _integer_, REQUIRED + + The hidden size (embedding dimension) of the vision encoder. + +- **patch_size** _integer_, REQUIRED + + The spatial patch size in pixels. For example, `14` means the image is divided into 14×14 pixel patches. Each patch becomes one visual token. + +- **image_size** _integer_, REQUIRED + + The default input image resolution in pixels. + +- **num_layers** _integer_, REQUIRED + + The number of transformer layers in the vision encoder. + +- **num_attention_heads** _integer_, REQUIRED + + The number of attention heads in the vision encoder. + +- **intermediate_size** _integer_, OPTIONAL + + The FFN intermediate size in the vision encoder. 
+ +- **in_channels** _integer_, OPTIONAL + + The number of input image channels. Defaults to `3` (RGB). + +- **activation** _string_, OPTIONAL + + The activation function used in the vision encoder, such as `"quick_gelu"`, `"gelu"`, or `"silu"`. + +- **norm** _object_, OPTIONAL + + Normalization configuration for the vision encoder. + + - **type** _string_, OPTIONAL + + The normalization type. Supported values: `"layernorm"`, `"rmsnorm"`. + + - **epsilon** _number_, OPTIONAL + + The epsilon value for normalization. + +- **projector** _object_, OPTIONAL + + The multimodal projector that maps vision encoder outputs to the language model embedding space. + + - **type** _string_, OPTIONAL + + The projector architecture type. Supported values: + + | Value | Description | + |-------|-------------| + | `"mlp"` | Multi-layer perceptron (e.g., LLaVA 1.5 uses 2-layer MLP with GELU) | + | `"linear"` | Single linear projection | + | `"cross_attention"` | Cross-attention layers (e.g., LLaMA-3.2 Vision) | + | `"perceiver"` | Perceiver-style resampler | + | `"other"` | Other projector architecture | + + - **num_layers** _integer_, OPTIONAL + + The number of layers in the projector (for MLP-type projectors). + + - **activation** _string_, OPTIONAL + + The activation function in the projector, such as `"gelu"`. + +- **special_tokens** _object_, OPTIONAL + + Special token IDs used for image and video inputs in the tokenizer. + + - **image_token_id** _integer_, OPTIONAL + + The token ID used as a placeholder for image input in the text sequence. + + - **vision_start_token_id** _integer_, OPTIONAL + + The token ID marking the start of a vision region (used by models like Qwen2-VL). + + - **vision_end_token_id** _integer_, OPTIONAL + + The token ID marking the end of a vision region. + + - **video_token_id** _integer_, OPTIONAL + + The token ID for video frame placeholders. 
+
+- **dynamic_resolution** _object_, OPTIONAL
+
+  Dynamic image resolution support, where the model can handle variable-resolution inputs.
+
+  - **enabled** _boolean_, OPTIONAL
+
+    Whether dynamic resolution is enabled.
+
+  - **min_pixels** _integer_, OPTIONAL
+
+    The minimum number of image pixels; smaller inputs are resized up to this pixel budget before patch embedding.
+
+  - **max_pixels** _integer_, OPTIONAL
+
+    The maximum number of image pixels; larger inputs are resized down to this pixel budget before patch embedding.
+
+  - **spatial_merge_size** _integer_, OPTIONAL
+
+    The spatial merging stride for reducing visual token count.
+
+- **temporal_patch_size** _integer_, OPTIONAL
+
+  The temporal patch size for video understanding. Specifies how many frames are grouped into one temporal patch.
+
+- **fusion_type** _string_, OPTIONAL
+
+  How vision and language modalities are fused. Supported values:
+
+  | Value | Description |
+  |-------|-------------|
+  | `"early"` | Visual tokens are concatenated with text tokens before the first transformer layer (e.g., Qwen2-VL) |
+  | `"late"` | Visual tokens are injected after separate encoding (e.g., LLaVA) |
+  | `"cross_attention"` | Dedicated cross-attention layers between vision and language (e.g., LLaMA-3.2 Vision) |
+
+- **num_cross_attention_layers** _integer_, OPTIONAL
+
+  The number of cross-attention layers for vision-language fusion. Only applicable when `fusion_type` is `"cross_attention"`.
+
+- **position_embedding** _object_, OPTIONAL
+
+  Position embedding configuration for the vision encoder.
+
+  - **type** _string_, OPTIONAL
+
+    The type of position embedding. Supported values: `"learned"`, `"rope"`, `"mrope"`, `"sinusoidal"`.
+
+  - **mrope_sections** _array of integers_, OPTIONAL
+
+    Per-modality RoPE dimension sections. Only applicable when type is `"mrope"` (e.g., Qwen2-VL uses `[16, 24, 24]` for temporal, height, width dimensions). 
+ +## Model Coverage + +| Model | Encoder | Patch Size | Image Size | Projector | Fusion | Special Features | +|-------|---------|-----------|------------|-----------|--------|------------------| +| LLaVA 1.5 | CLIP-ViT-L/14 | 14 | 336 | 2-layer MLP | late | — | +| Qwen2-VL | ViT | 14 | dynamic | — | early | mRoPE, dynamic resolution, video | +| LLaMA-3.2 Vision | CLIP-ViT | 14 | 560 | cross-attention | cross_attention | Gated cross-attention | +| Gemma 2 VL | SigLIP | 14 | 224 | linear | late | — | + +## Example + +```json,title=Vision%20Encoder%20Config&mediatype=application/vnd.cncf.model.vision-encoder.v1+json +{ + "type": "clip_vit", + "hidden_size": 1024, + "patch_size": 14, + "image_size": 336, + "num_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "in_channels": 3, + "activation": "quick_gelu", + "norm": { + "type": "layernorm", + "epsilon": 1e-5 + }, + "projector": { + "type": "mlp", + "num_layers": 2, + "activation": "gelu" + }, + "special_tokens": { + "image_token_id": 32000 + }, + "fusion_type": "late", + "position_embedding": { + "type": "learned" + } +} +``` diff --git a/schema/vision-encoder-schema.json b/schema/vision-encoder-schema.json new file mode 100644 index 0000000..abce40a --- /dev/null +++ b/schema/vision-encoder-schema.json @@ -0,0 +1,167 @@ +{ + "description": "Vision Encoder Architecture Schema for Multimodal Models", + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://github.com/modelpack/model-spec/vision-encoder", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["vit", "clip_vit", "other"], + "description": "The vision encoder architecture type" + }, + "hidden_size": { + "type": "integer", + "description": "Hidden size / embedding dimension of the vision encoder" + }, + "patch_size": { + "type": "integer", + "description": "Spatial patch size in pixels (e.g., 14 means 14x14 patches)" + }, + "image_size": { + "type": "integer", + "description": "Default 
input image resolution in pixels" + }, + "num_layers": { + "type": "integer", + "description": "Number of transformer layers in the vision encoder" + }, + "num_attention_heads": { + "type": "integer", + "description": "Number of attention heads in the vision encoder" + }, + "intermediate_size": { + "type": "integer", + "description": "FFN intermediate size in the vision encoder" + }, + "in_channels": { + "type": "integer", + "description": "Number of input image channels (3 for RGB)", + "default": 3 + }, + "activation": { + "type": "string", + "description": "Activation function used in the vision encoder (e.g., quick_gelu, gelu, silu)" + }, + "norm": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["layernorm", "rmsnorm"], + "description": "Normalization type in the vision encoder" + }, + "epsilon": { + "type": "number", + "description": "Epsilon value for normalization" + } + }, + "additionalProperties": false + }, + "projector": { + "type": "object", + "description": "Multimodal projector that maps vision embeddings to language model space", + "properties": { + "type": { + "type": "string", + "enum": ["mlp", "linear", "cross_attention", "perceiver", "other"], + "description": "Projector architecture type" + }, + "num_layers": { + "type": "integer", + "description": "Number of layers in the projector (for MLP projectors)" + }, + "activation": { + "type": "string", + "description": "Activation function in the projector (e.g., gelu)" + } + }, + "additionalProperties": false + }, + "special_tokens": { + "type": "object", + "description": "Special token IDs for image/video in the tokenizer", + "properties": { + "image_token_id": { + "type": "integer", + "description": "Token ID used as a placeholder for image input" + }, + "vision_start_token_id": { + "type": "integer", + "description": "Token ID marking the start of a vision region" + }, + "vision_end_token_id": { + "type": "integer", + "description": "Token ID marking the end of a 
vision region" + }, + "video_token_id": { + "type": "integer", + "description": "Token ID for video frame placeholder" + } + }, + "additionalProperties": false + }, + "dynamic_resolution": { + "type": "object", + "description": "Dynamic image resolution support (e.g., Qwen2-VL native dynamic resolution)", + "properties": { + "enabled": { + "type": "boolean" + }, + "min_pixels": { + "type": "integer", + "description": "Minimum number of visual tokens" + }, + "max_pixels": { + "type": "integer", + "description": "Maximum number of visual tokens" + }, + "spatial_merge_size": { + "type": "integer", + "description": "Spatial merging stride for reducing token count" + } + }, + "additionalProperties": false + }, + "temporal_patch_size": { + "type": "integer", + "description": "Temporal patch size for video understanding (number of frames per patch)" + }, + "fusion_type": { + "type": "string", + "enum": ["early", "late", "cross_attention"], + "description": "How vision and language modalities are fused" + }, + "num_cross_attention_layers": { + "type": "integer", + "description": "Number of cross-attention layers for vision-language fusion (for cross_attention fusion type)" + }, + "position_embedding": { + "type": "object", + "description": "Position embedding configuration for the vision encoder", + "properties": { + "type": { + "type": "string", + "enum": ["learned", "rope", "mrope", "sinusoidal"], + "description": "Type of position embedding" + }, + "mrope_sections": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Per-modality RoPE dimension sections (for mrope type)" + } + }, + "additionalProperties": false + } + }, + "required": [ + "type", + "hidden_size", + "patch_size", + "image_size", + "num_layers", + "num_attention_heads" + ], + "additionalProperties": false +} diff --git a/tools/hf_parser.py b/tools/hf_parser.py new file mode 100644 index 0000000..5735431 --- /dev/null +++ b/tools/hf_parser.py @@ -0,0 +1,547 @@ +#!/usr/bin/env 
python3 +"""Parse HuggingFace model config.json into ModelPack transformer spec format. + +This tool maps HuggingFace Transformers config.json fields to the ModelPack +unified transformer specification vocabulary defined in PR #111 +(docs/architecture.md by @aftersnow). + +Usage: + python tools/hf_parser.py meta-llama/Meta-Llama-3-8B + python tools/hf_parser.py mistralai/Mistral-7B-v0.3 + python tools/hf_parser.py --file path/to/config.json + +The output is a YAML spec file following the ModelPack transformer spec format. +Fields that cannot be reliably inferred from config.json are marked as +NEEDS_REVIEW for human verification. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +NEEDS_REVIEW = "__NEEDS_REVIEW__" + +# Maps HuggingFace config.json field names to ModelPack transformer spec paths. +# Based on PR #111's field vocabulary (docs/architecture.md). +FIELD_MAP = { + # Top-level transformer fields + "vocab_size": "vocabulary_size", + "hidden_size": "hidden_size", + # Position embedding + "max_position_embeddings": "position_embedding.max_position_embeddings", + "rope_theta": "position_embedding.rope_theta", + "rope_scaling": "position_embedding.rope_scaling", + # Attention + "num_attention_heads": "attention.num_attention_heads", + "num_key_value_heads": "attention.num_key_value_heads", + "head_dim": "attention.head_dim", + # FFN / MLP + "intermediate_size": "mlp.intermediate_size", + # Transformer layers + "num_hidden_layers": "num_layers", + # Normalization + "rms_norm_eps": "norm.epsilon", + # MoE fields + "num_local_experts": "moe.num_experts", + "num_experts_per_tok": "moe.top_k", + "num_experts": "moe.num_experts", + "n_routed_experts": "moe.num_experts", # DeepSeek naming variant + # MLA fields (DeepSeek) + "kv_lora_rank": "attention.kv_lora_rank", + "q_lora_rank": "attention.q_lora_rank", + "qk_nope_head_dim": "attention.qk_nope_head_dim", + "qk_rope_head_dim": 
"attention.qk_rope_head_dim", + "v_head_dim": "attention.v_head_dim", +} + +# Known model type → attention type mapping +ATTENTION_TYPE_MAP = { + "llama": "gqa", + "mistral": "gqa", + "mixtral": "gqa", + "qwen2": "gqa", + "qwen2_moe": "gqa", + "gemma": "gqa", + "gemma2": "gqa", + "phi3": "gqa", + "deepseek_v2": "mla", + "deepseek_v3": "mla", + "gpt2": "mha", + "gpt_neo": "mha", + "gpt_neox": "mha", + "falcon": "mha", +} + +# Known model type → FFN type mapping +FFN_TYPE_MAP = { + "llama": "mlp", + "mistral": "mlp", + "mixtral": "moe", + "qwen2": "mlp", + "qwen2_moe": "moe", + "gemma": "mlp", + "gemma2": "mlp", + "phi3": "mlp", + "deepseek_v2": "moe", + "deepseek_v3": "moe", + "gpt2": "mlp", + "gpt_neo": "mlp", + "gpt_neox": "mlp", + "falcon": "mlp", +} + +# Known model type → activation function mapping +ACTIVATION_MAP = { + "llama": "silu", + "mistral": "silu", + "mixtral": "silu", + "qwen2": "silu", + "qwen2_moe": "silu", + "gemma": "gelu", + "gemma2": "gelu", + "phi3": "silu", + "gpt2": "gelu", + "gpt_neo": "gelu", + "gpt_neox": "gelu", + "falcon": "gelu", +} + + +def _set_nested(d: dict, path: str, value) -> None: + """Set a value in a nested dict using dot-separated path.""" + keys = path.split(".") + for key in keys[:-1]: + d = d.setdefault(key, {}) + d[keys[-1]] = value + + +def _get_nested(d: dict, path: str, default=None): + """Get a value from a nested dict using dot-separated path.""" + keys = path.split(".") + for key in keys: + if not isinstance(d, dict) or key not in d: + return default + d = d[key] + return d + + +def parse_hf_config(raw: dict) -> dict: + """Parse a HuggingFace config.json dict into ModelPack transformer spec. + + Args: + raw: The parsed config.json dict from HuggingFace. + + Returns: + A dict following the ModelPack transformer spec format. 
+ """ + result: dict = {} + model_type = raw.get("model_type", "").lower() + + # Map static fields + for hf_key, mp_path in FIELD_MAP.items(): + if hf_key in raw and raw[hf_key] is not None: + _set_nested(result, mp_path, raw[hf_key]) + + # Derive head_dim if absent + if "attention" in result and "head_dim" not in result.get("attention", {}): + hidden = result.get("hidden_size") + n_heads = _get_nested(result, "attention.num_attention_heads") + if hidden and n_heads: + _set_nested(result, "attention.head_dim", hidden // n_heads) + + # Set architecture type + result["type"] = "decoder" + result["architecture_version"] = "0.1.0" + + # Infer attention type from model_type + attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW) + _set_nested(result, "attention.type", attn_type) + _set_nested(result, "attention.is_causal", True) + + # Check for sliding window attention + if raw.get("sliding_window") is not None: + _set_nested(result, "attention.sliding_window", raw["sliding_window"]) + + # Infer FFN type + ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW) + result["ffn_type"] = ffn_type + + # Set activation function + hf_activation = raw.get("hidden_act", raw.get("activation_function")) + if hf_activation: + activation = hf_activation.lower() + if "silu" in activation or "swish" in activation: + activation = "silu" + elif "gelu" in activation: + activation = "gelu" + elif "relu" in activation: + activation = "relu" + else: + activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW) + + if ffn_type == "mlp": + _set_nested(result, "mlp.activation", activation) + # Most modern models use gated activation (SwiGLU, GeGLU) + use_gated = model_type in ( + "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3", + "gemma", "gemma2", "deepseek_v2", "deepseek_v3", + ) + _set_nested(result, "mlp.use_gated_activation", use_gated) + elif ffn_type == "moe": + _set_nested(result, "moe.activation", activation) + # MoE-specific fields + if "moe_intermediate_size" in raw: 
+ _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"]) + if "num_shared_experts" in raw: + _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"]) + if "shared_expert_intermediate_size" in raw: + _set_nested( + result, "moe.shared_expert_intermediate_size", + raw["shared_expert_intermediate_size"], + ) + # DeepSeek MoE-specific fields (from PR #185 research) + if "routed_scaling_factor" in raw: + _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"]) + if "topk_method" in raw: + _set_nested(result, "moe.topk_method", raw["topk_method"]) + if "norm_topk_prob" in raw: + _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"]) + + # Mixed layers support (DeepSeek uses dense layers before switching to MoE) + if "first_k_dense_replace" in raw and "moe_layer_freq" in raw: + result["layer_structure"] = "mixed" + _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"]) + _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"]) + + # Normalization + norm_type = "rmsnorm" # Most modern models use RMSNorm + if model_type in ("gpt2", "gpt_neo"): + norm_type = "layernorm" + _set_nested(result, "norm.type", norm_type) + + if "layer_norm_eps" in raw: + _set_nested(result, "norm.epsilon", raw["layer_norm_eps"]) + + # Tokenizer + _set_nested(result, "tokenizer.type", "bpe") + _set_nested(result, "tokenizer.library", "huggingface") + + # Position embedding type + if model_type in ("gpt2", "gpt_neo"): + _set_nested(result, "position_embedding.type", "learned") + else: + _set_nested(result, "position_embedding.type", "rope") + + # Embedding + tie_embeddings = raw.get("tie_word_embeddings", False) + _set_nested(result, "token_embedding.shared_embedding", tie_embeddings) + + # Bias flags + attn_bias = raw.get("attention_bias", False) + _set_nested(result, "attention.has_qkv_bias", attn_bias) + _set_nested(result, "attention.has_output_bias", attn_bias) + + mlp_bias 
= raw.get("mlp_bias", False) + if ffn_type == "mlp": + _set_nested(result, "mlp.has_bias", mlp_bias) + + # Vision encoder (multimodal models) + vision = parse_vision_config(raw) + if vision: + result["vision_encoder"] = vision + + return result + + +# Maps HuggingFace vision config field names to ModelPack vision encoder paths. +VISION_FIELD_MAP = { + "hidden_size": "hidden_size", + "patch_size": "patch_size", + "image_size": "image_size", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_attention_heads", + "intermediate_size": "intermediate_size", + "num_channels": "in_channels", + "in_chans": "in_channels", +} + +# Known vision model types +VISION_MODEL_TYPES = { + "llava", "llava_next", "llava_onevision", + "mllama", # LLaMA-3.2 Vision + "qwen2_vl", + "paligemma", "idefics2", "idefics3", +} + +# Known vision encoder type mapping +VISION_ENCODER_TYPE_MAP = { + "clip_vision_model": "clip_vit", + "siglip_vision_model": "clip_vit", + "CLIPVisionConfig": "clip_vit", + "SiglipVisionConfig": "clip_vit", +} + +# Known projector type mapping +PROJECTOR_TYPE_MAP = { + "llava": ("mlp", 2, "gelu"), + "llava_next": ("mlp", 2, "gelu"), + "llava_onevision": ("mlp", 2, "gelu"), + "mllama": ("cross_attention", None, None), + "paligemma": ("linear", 1, None), +} + +# Known fusion type mapping +FUSION_TYPE_MAP = { + "llava": "late", + "llava_next": "late", + "llava_onevision": "late", + "mllama": "cross_attention", + "qwen2_vl": "early", + "paligemma": "late", + "idefics2": "late", +} + + +def parse_vision_config(raw: dict) -> dict | None: + """Parse vision encoder fields from a HuggingFace multimodal config. + + Args: + raw: The parsed config.json dict from HuggingFace. + + Returns: + A dict following the ModelPack vision encoder spec, or None if not + a vision model. 
+ """ + model_type = raw.get("model_type", "").lower() + + # Extract the nested vision_config dict + vcfg = raw.get("vision_config") + if vcfg is None and model_type not in VISION_MODEL_TYPES: + return None + + # Some models embed vision config as a flat dict, others as nested + if isinstance(vcfg, dict): + vision_raw = vcfg + else: + return None + + result: dict = {} + + # Map static vision fields + for hf_key, mp_path in VISION_FIELD_MAP.items(): + if hf_key in vision_raw and vision_raw[hf_key] is not None: + _set_nested(result, mp_path, vision_raw[hf_key]) + + # Qwen2-VL uses different field names + if model_type == "qwen2_vl": + if "depth" in vision_raw: + result["num_layers"] = vision_raw["depth"] + if "embed_dim" in vision_raw: + result["hidden_size"] = vision_raw["embed_dim"] + if "num_heads" in vision_raw: + result["num_attention_heads"] = vision_raw["num_heads"] + if "spatial_patch_size" in vision_raw: + result["patch_size"] = vision_raw["spatial_patch_size"] + if "temporal_patch_size" in vision_raw: + result["temporal_patch_size"] = vision_raw["temporal_patch_size"] + if "spatial_merge_size" in vision_raw: + _set_nested(result, "dynamic_resolution.enabled", True) + _set_nested( + result, "dynamic_resolution.spatial_merge_size", + vision_raw["spatial_merge_size"], + ) + + # Infer encoder type + vision_model_type = vision_raw.get("model_type", "") + encoder_type = VISION_ENCODER_TYPE_MAP.get(vision_model_type) + if encoder_type is None: + # Check for CLIP-like configs + if "projection_dim" in vision_raw or vision_model_type in ("clip_vision_model",): + encoder_type = "clip_vit" + else: + encoder_type = "vit" + result["type"] = encoder_type + + # Activation + hf_act = vision_raw.get("hidden_act") + if hf_act: + act = hf_act.lower() + if "quick_gelu" in act: + result["activation"] = "quick_gelu" + elif "gelu" in act: + result["activation"] = "gelu" + elif "silu" in act or "swish" in act: + result["activation"] = "silu" + + # Normalization + eps = 
vision_raw.get("layer_norm_eps") + if eps is not None: + _set_nested(result, "norm.type", "layernorm") + _set_nested(result, "norm.epsilon", eps) + + # Projector + proj_info = PROJECTOR_TYPE_MAP.get(model_type) + if proj_info: + proj_type, proj_layers, proj_act = proj_info + _set_nested(result, "projector.type", proj_type) + if proj_layers is not None: + _set_nested(result, "projector.num_layers", proj_layers) + if proj_act is not None: + _set_nested(result, "projector.activation", proj_act) + # Check for explicit projector config + if "projector_hidden_act" in raw: + act = raw["projector_hidden_act"].lower() + _set_nested(result, "projector.activation", act) + + # Special tokens + token_fields = { + "image_token_id": "special_tokens.image_token_id", + "image_token_index": "special_tokens.image_token_id", + "vision_start_token_id": "special_tokens.vision_start_token_id", + "vision_end_token_id": "special_tokens.vision_end_token_id", + "video_token_id": "special_tokens.video_token_id", + } + for hf_key, mp_path in token_fields.items(): + val = raw.get(hf_key) + if val is not None: + _set_nested(result, mp_path, val) + + # Fusion type + fusion = FUSION_TYPE_MAP.get(model_type, NEEDS_REVIEW) + result["fusion_type"] = fusion + + # Cross-attention layers (LLaMA-3.2 Vision) + if model_type == "mllama": + cross_attn_layers = raw.get("cross_attention_layers") + if cross_attn_layers is not None: + result["num_cross_attention_layers"] = len(cross_attn_layers) + + # Position embedding for vision encoder + if model_type == "qwen2_vl": + rope_scaling = raw.get("rope_scaling", {}) + if rope_scaling.get("type") == "mrope": + _set_nested(result, "position_embedding.type", "mrope") + sections = rope_scaling.get("mrope_section") + if sections: + _set_nested(result, "position_embedding.mrope_sections", sections) + else: + _set_nested(result, "position_embedding.type", "learned") + + return result + + +def format_yaml(spec: dict, indent: int = 0) -> str: + """Format a spec dict as 
YAML string.""" + lines = [] + prefix = " " * indent + for key, value in spec.items(): + if isinstance(value, dict): + lines.append(f"{prefix}{key}:") + lines.append(format_yaml(value, indent + 1)) + elif isinstance(value, bool): + lines.append(f"{prefix}{key}: {str(value).lower()}") + elif isinstance(value, str): + if value == NEEDS_REVIEW: + lines.append(f"{prefix}{key}: {value} # requires human review") + else: + lines.append(f'{prefix}{key}: "{value}"') + elif value is None: + lines.append(f"{prefix}{key}: null") + else: + lines.append(f"{prefix}{key}: {value}") + return "\n".join(lines) + + +def load_config(source: str) -> dict: + """Load a config.json from a file path or HuggingFace model ID. + + Args: + source: Either a local file path or a HuggingFace model ID. + + Returns: + The parsed config.json dict. + """ + path = Path(source) + if path.is_file(): + with path.open(encoding="utf-8") as f: + return json.load(f) + + # Try loading from HuggingFace Hub + try: + from huggingface_hub import hf_hub_download + + config_path = hf_hub_download(repo_id=source, filename="config.json") + with open(config_path, encoding="utf-8") as f: + return json.load(f) + except ImportError: + print( + "error: huggingface_hub not installed. 
" + "Install with: pip install huggingface_hub", + file=sys.stderr, + ) + sys.exit(1) + except Exception as e: + print(f"error: failed to load config from '{source}': {e}", file=sys.stderr) + sys.exit(1) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Parse HuggingFace config.json into ModelPack transformer spec", + ) + parser.add_argument( + "model", + help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) " + "or path to config.json", + ) + parser.add_argument( + "--format", + choices=["yaml", "json"], + default="yaml", + help="Output format (default: yaml)", + ) + + args = parser.parse_args() + + raw = load_config(args.model) + spec = parse_hf_config(raw) + + model_type = raw.get("model_type", "unknown") + model_name = raw.get("_name_or_path", args.model) + + if args.format == "json": + print(json.dumps(spec, indent=2)) + else: + print(f"# ModelPack Transformer Spec") + print(f"# Generated from: {model_name}") + print(f"# Model type: {model_type}") + print(f"# NOTE: Fields marked NEEDS_REVIEW require human verification") + print() + print(format_yaml(spec)) + + # Report coverage + needs_review = [] + _find_needs_review(spec, "", needs_review) + if needs_review: + print(f"\n# --- Fields requiring review ({len(needs_review)}) ---") + for field in needs_review: + print(f"# - {field}") + + return 0 + + +def _find_needs_review(d: dict, prefix: str, result: list) -> None: + """Recursively find all NEEDS_REVIEW fields.""" + for key, value in d.items(): + path = f"{prefix}.{key}" if prefix else key + if isinstance(value, dict): + _find_needs_review(value, path, result) + elif value == NEEDS_REVIEW: + result.append(path) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/hf_parser_test.py b/tools/hf_parser_test.py new file mode 100644 index 0000000..245a22f --- /dev/null +++ b/tools/hf_parser_test.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 +"""Tests for the HuggingFace config parser.""" + +from __future__ 
"""Tests for the HuggingFace config parser."""

from __future__ import annotations

import pytest

from hf_parser import NEEDS_REVIEW, parse_hf_config, parse_vision_config


# ---------------------------------------------------------------------------
# Minimal config.json fixtures modeled on real HuggingFace model configs.
# ---------------------------------------------------------------------------

MISTRAL_7B_CONFIG = {
    "model_type": "mistral",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "rope_theta": 10000.0,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "sliding_window": 4096,
    "tie_word_embeddings": False,
    "attention_bias": False,
}

MIXTRAL_8X7B_CONFIG = {
    "model_type": "mixtral",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "num_local_experts": 8,
    "num_experts_per_tok": 2,
    "tie_word_embeddings": False,
}

QWEN2_7B_CONFIG = {
    "model_type": "qwen2",
    "vocab_size": 152064,
    "hidden_size": 3584,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "intermediate_size": 18944,
    "max_position_embeddings": 131072,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "tie_word_embeddings": False,
    "attention_bias": True,
    "sliding_window": 131072,
}

GPT2_CONFIG = {
    "model_type": "gpt2",
    "vocab_size": 50257,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "max_position_embeddings": 1024,
    "layer_norm_eps": 1e-5,
    "activation_function": "gelu_new",
    "tie_word_embeddings": True,
}

DEEPSEEK_V2_LITE_CONFIG = {
    "model_type": "deepseek_v2",
    "vocab_size": 102400,
    "hidden_size": 2048,
    "num_hidden_layers": 27,
    "num_attention_heads": 16,
    "num_key_value_heads": 16,
    "intermediate_size": 10944,
    "max_position_embeddings": 163840,
    "rope_theta": 10000.0,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "kv_lora_rank": 512,
    "q_lora_rank": 1536,
    "qk_nope_head_dim": 128,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
    "n_routed_experts": 64,
    "num_experts_per_tok": 6,
    "first_k_dense_replace": 1,
    "moe_layer_freq": 1,
    "num_shared_experts": 2,
    "routed_scaling_factor": 1.0,
    "topk_method": "group_limited_greedy",
    "norm_topk_prob": False,
    "tie_word_embeddings": False,
}

UNKNOWN_CONFIG = {
    "model_type": "some_new_model",
    "vocab_size": 65536,
    "hidden_size": 2048,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
}


class TestMistral:
    """Dense decoder with GQA and a sliding attention window."""

    def test_basic_fields(self):
        parsed = parse_hf_config(MISTRAL_7B_CONFIG)
        assert parsed["type"] == "decoder"
        assert parsed["vocabulary_size"] == 32000
        assert parsed["hidden_size"] == 4096
        assert parsed["num_layers"] == 32

    def test_attention(self):
        attention = parse_hf_config(MISTRAL_7B_CONFIG)["attention"]
        assert attention["type"] == "gqa"
        assert attention["num_attention_heads"] == 32
        assert attention["num_key_value_heads"] == 8
        # head_dim is derived: 4096 hidden / 32 heads.
        assert attention["head_dim"] == 128
        assert attention["is_causal"] is True
        assert attention["sliding_window"] == 4096

    def test_ffn(self):
        parsed = parse_hf_config(MISTRAL_7B_CONFIG)
        assert parsed["ffn_type"] == "mlp"
        mlp = parsed["mlp"]
        assert mlp["intermediate_size"] == 14336
        assert mlp["activation"] == "silu"
        assert mlp["use_gated_activation"] is True

    def test_norm(self):
        norm = parse_hf_config(MISTRAL_7B_CONFIG)["norm"]
        assert norm["type"] == "rmsnorm"
        assert norm["epsilon"] == 1e-5

    def test_position_embedding(self):
        pos_emb = parse_hf_config(MISTRAL_7B_CONFIG)["position_embedding"]
        assert pos_emb["type"] == "rope"
        assert pos_emb["rope_theta"] == 10000.0
        assert pos_emb["max_position_embeddings"] == 32768


class TestMixtral:
    """Sparse MoE decoder; attention config matches Mistral."""

    def test_moe_detection(self):
        parsed = parse_hf_config(MIXTRAL_8X7B_CONFIG)
        assert parsed["ffn_type"] == "moe"
        assert parsed["moe"]["num_experts"] == 8
        assert parsed["moe"]["top_k"] == 2

    def test_attention_still_gqa(self):
        attention = parse_hf_config(MIXTRAL_8X7B_CONFIG)["attention"]
        assert attention["type"] == "gqa"
        assert attention["num_key_value_heads"] == 8


class TestQwen2:
    """Qwen2 uses QKV bias and a very large rope_theta."""

    def test_attention_bias(self):
        attention = parse_hf_config(QWEN2_7B_CONFIG)["attention"]
        assert attention["has_qkv_bias"] is True
        assert attention["has_output_bias"] is True

    def test_rope_theta(self):
        parsed = parse_hf_config(QWEN2_7B_CONFIG)
        assert parsed["position_embedding"]["rope_theta"] == 1000000.0


class TestGPT2:
    """Classic pre-RoPE decoder: MHA, LayerNorm, tied embeddings."""

    def test_mha_attention(self):
        parsed = parse_hf_config(GPT2_CONFIG)
        assert parsed["attention"]["type"] == "mha"

    def test_layernorm(self):
        norm = parse_hf_config(GPT2_CONFIG)["norm"]
        assert norm["type"] == "layernorm"
        assert norm["epsilon"] == 1e-5

    def test_tied_embeddings(self):
        parsed = parse_hf_config(GPT2_CONFIG)
        assert parsed["token_embedding"]["shared_embedding"] is True

    def test_no_gated_activation(self):
        parsed = parse_hf_config(GPT2_CONFIG)
        assert parsed["mlp"]["use_gated_activation"] is False

    def test_gelu_activation(self):
        # HF's "gelu_new" should normalize to plain "gelu".
        parsed = parse_hf_config(GPT2_CONFIG)
        assert parsed["mlp"]["activation"] == "gelu"


class TestDeepSeekV2:
    """MLA attention plus a routed/shared-expert MoE with mixed layers."""

    def test_mla_attention(self):
        parsed = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert parsed["attention"]["type"] == "mla"

    def test_mla_fields(self):
        attention = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)["attention"]
        assert attention["kv_lora_rank"] == 512
        assert attention["q_lora_rank"] == 1536
        assert attention["qk_nope_head_dim"] == 128
        assert attention["qk_rope_head_dim"] == 64
        assert attention["v_head_dim"] == 128

    def test_moe_with_n_routed_experts(self):
        parsed = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert parsed["ffn_type"] == "moe"
        assert parsed["moe"]["num_experts"] == 64
        assert parsed["moe"]["top_k"] == 6

    def test_shared_experts(self):
        parsed = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert parsed["moe"]["num_shared_experts"] == 2

    def test_moe_routing_fields(self):
        moe = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)["moe"]
        assert moe["routed_scaling_factor"] == 1.0
        assert moe["topk_method"] == "group_limited_greedy"
        assert moe["norm_topk_prob"] is False

    def test_mixed_layers(self):
        parsed = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert parsed["layer_structure"] == "mixed"
        mixed = parsed["mixed_layers"]
        assert mixed["first_k_dense_replace"] == 1
        assert mixed["moe_layer_freq"] == 1


class TestGPT2PositionEmbedding:
    """GPT-2 uses learned absolute position embeddings."""

    def test_learned_position_embedding(self):
        parsed = parse_hf_config(GPT2_CONFIG)
        assert parsed["position_embedding"]["type"] == "learned"


class TestUnknownModel:
    """Unrecognized model types flag derived fields for human review."""

    def test_needs_review_flags(self):
        parsed = parse_hf_config(UNKNOWN_CONFIG)
        assert parsed["attention"]["type"] == NEEDS_REVIEW
        assert parsed["ffn_type"] == NEEDS_REVIEW

    def test_static_fields_still_parsed(self):
        parsed = parse_hf_config(UNKNOWN_CONFIG)
        assert parsed["vocabulary_size"] == 65536
        assert parsed["hidden_size"] == 2048
        assert parsed["num_layers"] == 24

    def test_head_dim_derived(self):
        # 2048 hidden / 16 heads.
        parsed = parse_hf_config(UNKNOWN_CONFIG)
        assert parsed["attention"]["head_dim"] == 128


class TestHeadDimDerivation:
    """An explicit head_dim must win over the hidden_size / heads ratio."""

    def test_explicit_head_dim(self):
        config = {
            "model_type": "mistral",
            "vocab_size": 32000,
            "hidden_size": 4096,
            "num_attention_heads": 32,
            "head_dim": 64,  # explicit, not derived
        }
        parsed = parse_hf_config(config)
        # Explicit 64 beats the derived 4096 / 32 = 128.
        assert parsed["attention"]["head_dim"] == 64

    def test_derived_head_dim(self):
        config = {
            "model_type": "llama",
            "vocab_size": 32000,
            "hidden_size": 4096,
            "num_attention_heads": 32,
        }
        parsed = parse_hf_config(config)
        assert parsed["attention"]["head_dim"] == 4096 // 32
# ============================================================================
# Vision model configs
# ============================================================================

LLAVA_15_CONFIG = {
    "model_type": "llava",
    "vocab_size": 32064,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 32,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "image_token_index": 32000,
    "projector_hidden_act": "gelu",
    "vision_config": {
        "model_type": "clip_vision_model",
        "hidden_size": 1024,
        "patch_size": 14,
        "image_size": 336,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "num_channels": 3,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "projection_dim": 768,
    },
}

QWEN2_VL_CONFIG = {
    "model_type": "qwen2_vl",
    "vocab_size": 152064,
    "hidden_size": 3584,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "vision_start_token_id": 151652,
    "vision_end_token_id": 151653,
    "vision_token_id": 151654,
    "image_token_id": 151655,
    "video_token_id": 151656,
    "rope_scaling": {
        "type": "mrope",
        "mrope_section": [16, 24, 24],
    },
    "vision_config": {
        "depth": 32,
        "embed_dim": 1280,
        "num_heads": 16,
        "in_chans": 3,
        "spatial_patch_size": 14,
        "spatial_merge_size": 2,
        "temporal_patch_size": 2,
        "hidden_act": "quick_gelu",
    },
}

LLAMA_32_VISION_CONFIG = {
    "model_type": "mllama",
    "vocab_size": 128256,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 131072,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "image_token_index": 128256,
    "cross_attention_layers": [3, 8, 13, 18, 23, 28, 33, 38],
    "vision_config": {
        "model_type": "clip_vision_model",
        "hidden_size": 1280,
        "patch_size": 14,
        "image_size": 560,
        "num_hidden_layers": 32,
        "num_attention_heads": 16,
        "intermediate_size": 5120,
        "num_channels": 3,
        "hidden_act": "gelu",
        "layer_norm_eps": 1e-5,
    },
}

TEXT_ONLY_CONFIG = {
    "model_type": "llama",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
}


# ============================================================================
# Vision model tests
# ============================================================================

class TestLLaVA:
    """LLaVA 1.5: CLIP-ViT encoder + MLP projector, late fusion."""

    def test_vision_encoder_present(self):
        parsed = parse_hf_config(LLAVA_15_CONFIG)
        assert "vision_encoder" in parsed

    def test_encoder_type(self):
        encoder = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]
        assert encoder["type"] == "clip_vit"

    def test_encoder_fields(self):
        encoder = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]
        assert encoder["hidden_size"] == 1024
        assert encoder["patch_size"] == 14
        assert encoder["image_size"] == 336
        assert encoder["num_layers"] == 24
        assert encoder["num_attention_heads"] == 16
        assert encoder["intermediate_size"] == 4096
        assert encoder["in_channels"] == 3

    def test_activation(self):
        encoder = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]
        assert encoder["activation"] == "quick_gelu"

    def test_norm(self):
        norm = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]["norm"]
        assert norm["type"] == "layernorm"
        assert norm["epsilon"] == 1e-5

    def test_projector(self):
        projector = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]["projector"]
        assert projector["type"] == "mlp"
        assert projector["num_layers"] == 2
        assert projector["activation"] == "gelu"

    def test_special_tokens(self):
        tokens = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]["special_tokens"]
        assert tokens["image_token_id"] == 32000

    def test_fusion_type(self):
        encoder = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]
        assert encoder["fusion_type"] == "late"

    def test_position_embedding(self):
        encoder = parse_hf_config(LLAVA_15_CONFIG)["vision_encoder"]
        assert encoder["position_embedding"]["type"] == "learned"


class TestQwen2VL:
    """Qwen2-VL: dynamic-resolution ViT, mRoPE, early fusion."""

    def test_vision_encoder_present(self):
        parsed = parse_hf_config(QWEN2_VL_CONFIG)
        assert "vision_encoder" in parsed

    def test_qwen_specific_fields(self):
        # Qwen2-VL names these depth/embed_dim/num_heads/spatial_patch_size.
        encoder = parse_hf_config(QWEN2_VL_CONFIG)["vision_encoder"]
        assert encoder["num_layers"] == 32
        assert encoder["hidden_size"] == 1280
        assert encoder["num_attention_heads"] == 16
        assert encoder["patch_size"] == 14

    def test_temporal_patch(self):
        encoder = parse_hf_config(QWEN2_VL_CONFIG)["vision_encoder"]
        assert encoder["temporal_patch_size"] == 2

    def test_dynamic_resolution(self):
        dyn = parse_hf_config(QWEN2_VL_CONFIG)["vision_encoder"]["dynamic_resolution"]
        assert dyn["enabled"] is True
        assert dyn["spatial_merge_size"] == 2

    def test_special_tokens(self):
        tokens = parse_hf_config(QWEN2_VL_CONFIG)["vision_encoder"]["special_tokens"]
        assert tokens["image_token_id"] == 151655
        assert tokens["vision_start_token_id"] == 151652
        assert tokens["vision_end_token_id"] == 151653
        assert tokens["video_token_id"] == 151656

    def test_mrope_position_embedding(self):
        pos_emb = parse_hf_config(QWEN2_VL_CONFIG)["vision_encoder"]["position_embedding"]
        assert pos_emb["type"] == "mrope"
        assert pos_emb["mrope_sections"] == [16, 24, 24]

    def test_fusion_type(self):
        encoder = parse_hf_config(QWEN2_VL_CONFIG)["vision_encoder"]
        assert encoder["fusion_type"] == "early"


class TestLLaMA32Vision:
    """LLaMA-3.2 Vision (mllama): cross-attention fusion."""

    def test_vision_encoder_present(self):
        parsed = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert "vision_encoder" in parsed

    def test_encoder_fields(self):
        encoder = parse_hf_config(LLAMA_32_VISION_CONFIG)["vision_encoder"]
        assert encoder["hidden_size"] == 1280
        assert encoder["patch_size"] == 14
        assert encoder["image_size"] == 560
        assert encoder["num_layers"] == 32
        assert encoder["num_attention_heads"] == 16

    def test_cross_attention_projector(self):
        projector = parse_hf_config(LLAMA_32_VISION_CONFIG)["vision_encoder"]["projector"]
        assert projector["type"] == "cross_attention"

    def test_cross_attention_layers_count(self):
        # len(cross_attention_layers) in the fixture is 8.
        encoder = parse_hf_config(LLAMA_32_VISION_CONFIG)["vision_encoder"]
        assert encoder["num_cross_attention_layers"] == 8

    def test_fusion_type(self):
        encoder = parse_hf_config(LLAMA_32_VISION_CONFIG)["vision_encoder"]
        assert encoder["fusion_type"] == "cross_attention"

    def test_special_tokens(self):
        tokens = parse_hf_config(LLAMA_32_VISION_CONFIG)["vision_encoder"]["special_tokens"]
        assert tokens["image_token_id"] == 128256


class TestTextOnlyModel:
    """Text-only configs must not grow a vision_encoder section."""

    def test_no_vision_encoder(self):
        parsed = parse_hf_config(TEXT_ONLY_CONFIG)
        assert "vision_encoder" not in parsed

    def test_no_vision_config_returns_none(self):
        assert parse_vision_config(TEXT_ONLY_CONFIG) is None
Signed-off-by: pradhyum6144 --- docs/vision-encoder.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vision-encoder.md b/docs/vision-encoder.md index c2eee0d..5444d04 100644 --- a/docs/vision-encoder.md +++ b/docs/vision-encoder.md @@ -10,7 +10,7 @@ The current ModelPack specification supports declaring image modality via `capab Vision-language models follow a common pattern: -``` +```text Input Image → Vision Encoder → Projector → Language Model → Text Output ↓ Visual token embeddings From 4fc53bda22873f0db1fa246163ffaca2fb90fa10 Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Tue, 24 Mar 2026 00:11:08 +0530 Subject: [PATCH 3/5] fix: remove unregistered mediatype from vision-encoder example The example_test.go validator has no registered schema for the vision-encoder mediatype, causing CI to fail. Remove the mediatype tag until the validator is extended to support it. Signed-off-by: pradhyum6144 --- docs/vision-encoder.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vision-encoder.md b/docs/vision-encoder.md index 5444d04..3eb827f 100644 --- a/docs/vision-encoder.md +++ b/docs/vision-encoder.md @@ -179,7 +179,7 @@ The **vision encoder** converts raw images into a sequence of visual tokens usin ## Example -```json,title=Vision%20Encoder%20Config&mediatype=application/vnd.cncf.model.vision-encoder.v1+json +```json { "type": "clip_vit", "hidden_size": 1024, From 5f1d7ee6dc0b84e7d9cc5a4c21b8e8703ed79618 Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Tue, 24 Mar 2026 00:14:04 +0530 Subject: [PATCH 4/5] fix: move vision-encoder-schema.json out of schema/ directory schema.go uses //go:embed *.json which embeds ALL json files in schema/. The validator then requires each embedded file to have an entry in specURLs, which vision-encoder-schema.json does not have. Moved to docs/schemas/ to avoid breaking the Go embed/validation pipeline until the vision encoder is integrated into the validator. 
Signed-off-by: pradhyum6144 --- {schema => docs/schemas}/vision-encoder-schema.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {schema => docs/schemas}/vision-encoder-schema.json (100%) diff --git a/schema/vision-encoder-schema.json b/docs/schemas/vision-encoder-schema.json similarity index 100% rename from schema/vision-encoder-schema.json rename to docs/schemas/vision-encoder-schema.json From 66adf85482956f9b23d87c84e534196a536482fc Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Tue, 24 Mar 2026 00:18:41 +0530 Subject: [PATCH 5/5] fix: address Gemini review feedback on vision encoder spec - Add missing vision_token_id to spec, schema, and parser (Qwen2-VL uses 5 special tokens, not 4) - Move num_cross_attention_layers into projector.num_layers for consistent schema structure across MLP and cross-attention projectors - Add vision_token_id assertion in Qwen2-VL test - Update projector.num_layers description to cover both MLP and cross-attention types Signed-off-by: pradhyum6144 --- docs/schemas/vision-encoder-schema.json | 10 +++++----- docs/vision-encoder.md | 10 +++++----- tools/hf_parser.py | 5 +++-- tools/hf_parser_test.py | 3 ++- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/schemas/vision-encoder-schema.json b/docs/schemas/vision-encoder-schema.json index abce40a..6c76417 100644 --- a/docs/schemas/vision-encoder-schema.json +++ b/docs/schemas/vision-encoder-schema.json @@ -68,7 +68,7 @@ }, "num_layers": { "type": "integer", - "description": "Number of layers in the projector (for MLP projectors)" + "description": "Number of layers in the projector (for MLP or cross-attention projectors)" }, "activation": { "type": "string", @@ -93,6 +93,10 @@ "type": "integer", "description": "Token ID marking the end of a vision region" }, + "vision_token_id": { + "type": "integer", + "description": "Token ID for a generic vision placeholder (e.g., used by Qwen2-VL)" + }, "video_token_id": { "type": "integer", "description": "Token 
ID for video frame placeholder" @@ -131,10 +135,6 @@ "enum": ["early", "late", "cross_attention"], "description": "How vision and language modalities are fused" }, - "num_cross_attention_layers": { - "type": "integer", - "description": "Number of cross-attention layers for vision-language fusion (for cross_attention fusion type)" - }, "position_embedding": { "type": "object", "description": "Position embedding configuration for the vision encoder", diff --git a/docs/vision-encoder.md b/docs/vision-encoder.md index 3eb827f..e05d3c0 100644 --- a/docs/vision-encoder.md +++ b/docs/vision-encoder.md @@ -92,7 +92,7 @@ The **vision encoder** converts raw images into a sequence of visual tokens usin - **num_layers** _integer_, OPTIONAL - The number of layers in the projector (for MLP-type projectors). + The number of layers in the projector (for MLP or cross-attention type projectors). - **activation** _string_, OPTIONAL @@ -114,6 +114,10 @@ The **vision encoder** converts raw images into a sequence of visual tokens usin The token ID marking the end of a vision region. + - **vision_token_id** _integer_, OPTIONAL + + The token ID for a generic vision placeholder (used by models like Qwen2-VL). + - **video_token_id** _integer_, OPTIONAL The token ID for video frame placeholders. @@ -152,10 +156,6 @@ The **vision encoder** converts raw images into a sequence of visual tokens usin | `"late"` | Visual tokens are injected after separate encoding (e.g., LLaVA) | | `"cross_attention"` | Dedicated cross-attention layers between vision and language (e.g., LLaMA-3.2 Vision) | -- **num_cross_attention_layers** _integer_, OPTIONAL - - The number of cross-attention layers for vision-language fusion. Only applicable when `fusion_type` is `"cross_attention"`. - - **position_embedding** _object_, OPTIONAL Position embedding configuration for the vision encoder. 
diff --git a/tools/hf_parser.py b/tools/hf_parser.py index 5735431..2ef8ae9 100644 --- a/tools/hf_parser.py +++ b/tools/hf_parser.py @@ -402,6 +402,7 @@ def parse_vision_config(raw: dict) -> dict | None: "image_token_index": "special_tokens.image_token_id", "vision_start_token_id": "special_tokens.vision_start_token_id", "vision_end_token_id": "special_tokens.vision_end_token_id", + "vision_token_id": "special_tokens.vision_token_id", "video_token_id": "special_tokens.video_token_id", } for hf_key, mp_path in token_fields.items(): @@ -413,11 +414,11 @@ def parse_vision_config(raw: dict) -> dict | None: fusion = FUSION_TYPE_MAP.get(model_type, NEEDS_REVIEW) result["fusion_type"] = fusion - # Cross-attention layers (LLaMA-3.2 Vision) + # Cross-attention layers count into projector.num_layers (LLaMA-3.2 Vision) if model_type == "mllama": cross_attn_layers = raw.get("cross_attention_layers") if cross_attn_layers is not None: - result["num_cross_attention_layers"] = len(cross_attn_layers) + _set_nested(result, "projector.num_layers", len(cross_attn_layers)) # Position embedding for vision encoder if model_type == "qwen2_vl": diff --git a/tools/hf_parser_test.py b/tools/hf_parser_test.py index 245a22f..9d27255 100644 --- a/tools/hf_parser_test.py +++ b/tools/hf_parser_test.py @@ -462,6 +462,7 @@ def test_special_tokens(self): assert tokens["image_token_id"] == 151655 assert tokens["vision_start_token_id"] == 151652 assert tokens["vision_end_token_id"] == 151653 + assert tokens["vision_token_id"] == 151654 assert tokens["video_token_id"] == 151656 def test_mrope_position_embedding(self): @@ -496,7 +497,7 @@ def test_cross_attention_projector(self): def test_cross_attention_layers_count(self): spec = parse_hf_config(LLAMA_32_VISION_CONFIG) - assert spec["vision_encoder"]["num_cross_attention_layers"] == 8 + assert spec["vision_encoder"]["projector"]["num_layers"] == 8 def test_fusion_type(self): spec = parse_hf_config(LLAMA_32_VISION_CONFIG)