diff --git a/docs/schemas/vision-encoder-schema.json b/docs/schemas/vision-encoder-schema.json
new file mode 100644
index 0000000..6c76417
--- /dev/null
+++ b/docs/schemas/vision-encoder-schema.json
@@ -0,0 +1,167 @@
+{
+  "description": "Vision Encoder Architecture Schema for Multimodal Models",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://github.com/modelpack/model-spec/vision-encoder",
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "enum": ["vit", "clip_vit", "other"],
+      "description": "The vision encoder architecture type"
+    },
+    "hidden_size": {
+      "type": "integer",
+      "description": "Hidden size / embedding dimension of the vision encoder"
+    },
+    "patch_size": {
+      "type": "integer",
+      "description": "Spatial patch size in pixels (e.g., 14 means 14x14 patches)"
+    },
+    "image_size": {
+      "type": "integer",
+      "description": "Default input image resolution in pixels"
+    },
+    "num_layers": {
+      "type": "integer",
+      "description": "Number of transformer layers in the vision encoder"
+    },
+    "num_attention_heads": {
+      "type": "integer",
+      "description": "Number of attention heads in the vision encoder"
+    },
+    "intermediate_size": {
+      "type": "integer",
+      "description": "FFN intermediate size in the vision encoder"
+    },
+    "in_channels": {
+      "type": "integer",
+      "description": "Number of input image channels (3 for RGB)",
+      "default": 3
+    },
+    "activation": {
+      "type": "string",
+      "description": "Activation function used in the vision encoder (e.g., quick_gelu, gelu, silu)"
+    },
+    "norm": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": ["layernorm", "rmsnorm"],
+          "description": "Normalization type in the vision encoder"
+        },
+        "epsilon": {
+          "type": "number",
+          "description": "Epsilon value for normalization"
+        }
+      },
+      "additionalProperties": false
+    },
+    "projector": {
+      "type": "object",
+      "description": "Multimodal projector that maps vision embeddings to language model space",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": ["mlp", "linear", "cross_attention", "perceiver", "other"],
+          "description": "Projector architecture type"
+        },
+        "num_layers": {
+          "type": "integer",
+          "description": "Number of layers in the projector (for MLP or cross-attention projectors)"
+        },
+        "activation": {
+          "type": "string",
+          "description": "Activation function in the projector (e.g., gelu)"
+        }
+      },
+      "additionalProperties": false
+    },
+    "special_tokens": {
+      "type": "object",
+      "description": "Special token IDs for image/video in the tokenizer",
+      "properties": {
+        "image_token_id": {
+          "type": "integer",
+          "description": "Token ID used as a placeholder for image input"
+        },
+        "vision_start_token_id": {
+          "type": "integer",
+          "description": "Token ID marking the start of a vision region"
+        },
+        "vision_end_token_id": {
+          "type": "integer",
+          "description": "Token ID marking the end of a vision region"
+        },
+        "vision_token_id": {
+          "type": "integer",
+          "description": "Token ID for a generic vision placeholder (e.g., used by Qwen2-VL)"
+        },
+        "video_token_id": {
+          "type": "integer",
+          "description": "Token ID for video frame placeholder"
+        }
+      },
+      "additionalProperties": false
+    },
+    "dynamic_resolution": {
+      "type": "object",
+      "description": "Dynamic image resolution support (e.g., Qwen2-VL native dynamic resolution)",
+      "properties": {
+        "enabled": {
+          "type": "boolean"
+        },
+        "min_pixels": {
+          "type": "integer",
+          "description": "Minimum number of image pixels after resizing (lower bound used to control the visual token count)"
+        },
+        "max_pixels": {
+          "type": "integer",
+          "description": "Maximum number of image pixels after resizing (upper bound used to control the visual token count)"
+        },
+        "spatial_merge_size": {
+          "type": "integer",
+          "description": "Spatial merging stride for reducing token count"
+        }
+      },
+      "additionalProperties": false
+    },
+    "temporal_patch_size": {
+      "type": "integer",
+      "description": "Temporal patch size for video understanding (number of frames per patch)"
+    },
+    "fusion_type": {
+      "type": "string",
+      "enum": ["early", "late",
"cross_attention"], + "description": "How vision and language modalities are fused" + }, + "position_embedding": { + "type": "object", + "description": "Position embedding configuration for the vision encoder", + "properties": { + "type": { + "type": "string", + "enum": ["learned", "rope", "mrope", "sinusoidal"], + "description": "Type of position embedding" + }, + "mrope_sections": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Per-modality RoPE dimension sections (for mrope type)" + } + }, + "additionalProperties": false + } + }, + "required": [ + "type", + "hidden_size", + "patch_size", + "image_size", + "num_layers", + "num_attention_heads" + ], + "additionalProperties": false +} diff --git a/docs/vision-encoder.md b/docs/vision-encoder.md new file mode 100644 index 0000000..e05d3c0 --- /dev/null +++ b/docs/vision-encoder.md @@ -0,0 +1,210 @@ +# Vision Encoder Specification + +This document describes the vision encoder architecture fields for multimodal models that process image and video inputs. It extends the model configuration defined in [config.md](./config.md) to cover the architectural details of how visual inputs are processed. + +## Background + +The current ModelPack specification supports declaring image modality via `capabilities.inputTypes: ["image"]`, but provides no architectural description of how images are processed. Every major model family now has a vision variant (LLaVA, Qwen2-VL, LLaMA-3.2 Vision, Gemma 2 VL), and inference engines need structured metadata about the vision encoder to correctly configure image preprocessing, patch embedding, and vision-language fusion. + +## Architecture Overview + +Vision-language models follow a common pattern: + +```text +Input Image → Vision Encoder → Projector → Language Model → Text Output + ↓ + Visual token embeddings +``` + +The **vision encoder** converts raw images into a sequence of visual tokens using a Vision Transformer (ViT) or CLIP-ViT architecture. 
A **projector** module maps these visual tokens into the language model's embedding space. The **fusion type** determines how visual and textual tokens interact inside the language model. + +## Properties + +- **type** _string_, REQUIRED + + The vision encoder architecture type. Supported values: + + | Value | Description | + |-------|-------------| + | `"vit"` | Standard Vision Transformer | + | `"clip_vit"` | CLIP-pretrained Vision Transformer | + | `"other"` | Other vision encoder architecture | + +- **hidden_size** _integer_, REQUIRED + + The hidden size (embedding dimension) of the vision encoder. + +- **patch_size** _integer_, REQUIRED + + The spatial patch size in pixels. For example, `14` means the image is divided into 14×14 pixel patches. Each patch becomes one visual token. + +- **image_size** _integer_, REQUIRED + + The default input image resolution in pixels. + +- **num_layers** _integer_, REQUIRED + + The number of transformer layers in the vision encoder. + +- **num_attention_heads** _integer_, REQUIRED + + The number of attention heads in the vision encoder. + +- **intermediate_size** _integer_, OPTIONAL + + The FFN intermediate size in the vision encoder. + +- **in_channels** _integer_, OPTIONAL + + The number of input image channels. Defaults to `3` (RGB). + +- **activation** _string_, OPTIONAL + + The activation function used in the vision encoder, such as `"quick_gelu"`, `"gelu"`, or `"silu"`. + +- **norm** _object_, OPTIONAL + + Normalization configuration for the vision encoder. + + - **type** _string_, OPTIONAL + + The normalization type. Supported values: `"layernorm"`, `"rmsnorm"`. + + - **epsilon** _number_, OPTIONAL + + The epsilon value for normalization. + +- **projector** _object_, OPTIONAL + + The multimodal projector that maps vision encoder outputs to the language model embedding space. + + - **type** _string_, OPTIONAL + + The projector architecture type. 
Supported values: + + | Value | Description | + |-------|-------------| + | `"mlp"` | Multi-layer perceptron (e.g., LLaVA 1.5 uses 2-layer MLP with GELU) | + | `"linear"` | Single linear projection | + | `"cross_attention"` | Cross-attention layers (e.g., LLaMA-3.2 Vision) | + | `"perceiver"` | Perceiver-style resampler | + | `"other"` | Other projector architecture | + + - **num_layers** _integer_, OPTIONAL + + The number of layers in the projector (for MLP or cross-attention type projectors). + + - **activation** _string_, OPTIONAL + + The activation function in the projector, such as `"gelu"`. + +- **special_tokens** _object_, OPTIONAL + + Special token IDs used for image and video inputs in the tokenizer. + + - **image_token_id** _integer_, OPTIONAL + + The token ID used as a placeholder for image input in the text sequence. + + - **vision_start_token_id** _integer_, OPTIONAL + + The token ID marking the start of a vision region (used by models like Qwen2-VL). + + - **vision_end_token_id** _integer_, OPTIONAL + + The token ID marking the end of a vision region. + + - **vision_token_id** _integer_, OPTIONAL + + The token ID for a generic vision placeholder (used by models like Qwen2-VL). + + - **video_token_id** _integer_, OPTIONAL + + The token ID for video frame placeholders. + +- **dynamic_resolution** _object_, OPTIONAL + + Dynamic image resolution support, where the model can handle variable-resolution inputs. + + - **enabled** _boolean_, OPTIONAL + + Whether dynamic resolution is enabled. + + - **min_pixels** _integer_, OPTIONAL + + The minimum number of visual tokens. + + - **max_pixels** _integer_, OPTIONAL + + The maximum number of visual tokens. + + - **spatial_merge_size** _integer_, OPTIONAL + + The spatial merging stride for reducing visual token count. + +- **temporal_patch_size** _integer_, OPTIONAL + + The temporal patch size for video understanding. Specifies how many frames are grouped into one temporal patch. 
+ +- **fusion_type** _string_, OPTIONAL + + How vision and language modalities are fused. Supported values: + + | Value | Description | + |-------|-------------| + | `"early"` | Visual tokens are concatenated with text tokens before the first transformer layer (e.g., Qwen2-VL) | + | `"late"` | Visual tokens are injected after separate encoding (e.g., LLaVA) | + | `"cross_attention"` | Dedicated cross-attention layers between vision and language (e.g., LLaMA-3.2 Vision) | + +- **position_embedding** _object_, OPTIONAL + + Position embedding configuration for the vision encoder. + + - **type** _string_, OPTIONAL + + The type of position embedding. Supported values: `"learned"`, `"rope"`, `"mrope"`, `"sinusoidal"`. + + - **mrope_sections** _array of integers_, OPTIONAL + + Per-modality RoPE dimension sections. Only applicable when type is `"mrope"` (e.g., Qwen2-VL uses `[16, 24, 24]` for temporal, height, width dimensions). + +## Model Coverage + +| Model | Encoder | Patch Size | Image Size | Projector | Fusion | Special Features | +|-------|---------|-----------|------------|-----------|--------|------------------| +| LLaVA 1.5 | CLIP-ViT-L/14 | 14 | 336 | 2-layer MLP | late | — | +| Qwen2-VL | ViT | 14 | dynamic | — | early | mRoPE, dynamic resolution, video | +| LLaMA-3.2 Vision | CLIP-ViT | 14 | 560 | cross-attention | cross_attention | Gated cross-attention | +| Gemma 2 VL | SigLIP | 14 | 224 | linear | late | — | + +## Example + +```json +{ + "type": "clip_vit", + "hidden_size": 1024, + "patch_size": 14, + "image_size": 336, + "num_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "in_channels": 3, + "activation": "quick_gelu", + "norm": { + "type": "layernorm", + "epsilon": 1e-5 + }, + "projector": { + "type": "mlp", + "num_layers": 2, + "activation": "gelu" + }, + "special_tokens": { + "image_token_id": 32000 + }, + "fusion_type": "late", + "position_embedding": { + "type": "learned" + } +} +``` diff --git a/tools/hf_parser.py 
b/tools/hf_parser.py new file mode 100644 index 0000000..2ef8ae9 --- /dev/null +++ b/tools/hf_parser.py @@ -0,0 +1,548 @@ +#!/usr/bin/env python3 +"""Parse HuggingFace model config.json into ModelPack transformer spec format. + +This tool maps HuggingFace Transformers config.json fields to the ModelPack +unified transformer specification vocabulary defined in PR #111 +(docs/architecture.md by @aftersnow). + +Usage: + python tools/hf_parser.py meta-llama/Meta-Llama-3-8B + python tools/hf_parser.py mistralai/Mistral-7B-v0.3 + python tools/hf_parser.py --file path/to/config.json + +The output is a YAML spec file following the ModelPack transformer spec format. +Fields that cannot be reliably inferred from config.json are marked as +NEEDS_REVIEW for human verification. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +NEEDS_REVIEW = "__NEEDS_REVIEW__" + +# Maps HuggingFace config.json field names to ModelPack transformer spec paths. +# Based on PR #111's field vocabulary (docs/architecture.md). 
+FIELD_MAP = {
+    # Top-level transformer fields
+    "vocab_size": "vocabulary_size",
+    "hidden_size": "hidden_size",
+    # GPT-2 lineage naming variants: real GPT-2 / GPT-Neo configs use the
+    # n_* names rather than hidden_size/num_hidden_layers, so map them too.
+    "n_embd": "hidden_size",
+    "n_layer": "num_layers",
+    "n_head": "attention.num_attention_heads",
+    "n_inner": "mlp.intermediate_size",
+    "n_positions": "position_embedding.max_position_embeddings",
+    # Position embedding
+    "max_position_embeddings": "position_embedding.max_position_embeddings",
+    "rope_theta": "position_embedding.rope_theta",
+    "rope_scaling": "position_embedding.rope_scaling",
+    # Attention
+    "num_attention_heads": "attention.num_attention_heads",
+    "num_key_value_heads": "attention.num_key_value_heads",
+    "head_dim": "attention.head_dim",
+    # FFN / MLP
+    "intermediate_size": "mlp.intermediate_size",
+    # Transformer layers
+    "num_hidden_layers": "num_layers",
+    # Normalization
+    "rms_norm_eps": "norm.epsilon",
+    # MoE fields
+    "num_local_experts": "moe.num_experts",
+    "num_experts_per_tok": "moe.top_k",
+    "num_experts": "moe.num_experts",
+    "n_routed_experts": "moe.num_experts",  # DeepSeek naming variant
+    # MLA fields (DeepSeek)
+    "kv_lora_rank": "attention.kv_lora_rank",
+    "q_lora_rank": "attention.q_lora_rank",
+    "qk_nope_head_dim": "attention.qk_nope_head_dim",
+    "qk_rope_head_dim": "attention.qk_rope_head_dim",
+    "v_head_dim": "attention.v_head_dim",
+}
+
+# Known model type → attention type mapping
+ATTENTION_TYPE_MAP = {
+    "llama": "gqa",
+    "mistral": "gqa",
+    "mixtral": "gqa",
+    "qwen2": "gqa",
+    "qwen2_moe": "gqa",
+    "gemma": "gqa",
+    "gemma2": "gqa",
+    "phi3": "gqa",
+    "deepseek_v2": "mla",
+    "deepseek_v3": "mla",
+    "gpt2": "mha",
+    "gpt_neo": "mha",
+    "gpt_neox": "mha",
+    "falcon": "mha",
+}
+
+# Known model type → FFN type mapping
+FFN_TYPE_MAP = {
+    "llama": "mlp",
+    "mistral": "mlp",
+    "mixtral": "moe",
+    "qwen2": "mlp",
+    "qwen2_moe": "moe",
+    "gemma": "mlp",
+    "gemma2": "mlp",
+    "phi3": "mlp",
+    "deepseek_v2": "moe",
+    "deepseek_v3": "moe",
+    "gpt2": "mlp",
+    "gpt_neo": "mlp",
+    "gpt_neox": "mlp",
+    "falcon": "mlp",
+}
+
+# Known model type → activation function mapping (fallback when the config
+# carries no explicit activation field).
+ACTIVATION_MAP = {
+    "llama": "silu",
+    "mistral": "silu",
+    "mixtral": "silu",
+    "qwen2": "silu",
+    "qwen2_moe": "silu",
+    "gemma": "gelu",
+    "gemma2": "gelu",
+    "phi3": "silu",
+    "gpt2": "gelu",
+    "gpt_neo": "gelu",
+    "gpt_neox": "gelu",
+    "falcon": "gelu",
+}
+
+
+def _set_nested(d: dict, path: str, value) -> None:
+    """Set a value in a nested dict using dot-separated path."""
+    keys = path.split(".")
+    for key in keys[:-1]:
+        d = d.setdefault(key, {})
+    d[keys[-1]] = value
+
+
+def _get_nested(d: dict, path: str, default=None):
+    """Get a value from a nested dict using dot-separated path."""
+    keys = path.split(".")
+    for key in keys:
+        if not isinstance(d, dict) or key not in d:
+            return default
+        d = d[key]
+    return d
+
+
+def parse_hf_config(raw: dict) -> dict:
+    """Parse a HuggingFace config.json dict into ModelPack transformer spec.
+
+    Args:
+        raw: The parsed config.json dict from HuggingFace.
+
+    Returns:
+        A dict following the ModelPack transformer spec format.
+    """
+    result: dict = {}
+    model_type = raw.get("model_type", "").lower()
+
+    # Map static fields (None values are treated as absent)
+    for hf_key, mp_path in FIELD_MAP.items():
+        if hf_key in raw and raw[hf_key] is not None:
+            _set_nested(result, mp_path, raw[hf_key])
+
+    # Derive head_dim if absent
+    if "attention" in result and "head_dim" not in result.get("attention", {}):
+        hidden = result.get("hidden_size")
+        n_heads = _get_nested(result, "attention.num_attention_heads")
+        if hidden and n_heads:
+            _set_nested(result, "attention.head_dim", hidden // n_heads)
+
+    # Set architecture type
+    result["type"] = "decoder"
+    result["architecture_version"] = "0.1.0"
+
+    # Infer attention type from model_type
+    attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW)
+    _set_nested(result, "attention.type", attn_type)
+    _set_nested(result, "attention.is_causal", True)
+
+    # Check for sliding window attention
+    if raw.get("sliding_window") is not None:
+        _set_nested(result, "attention.sliding_window", raw["sliding_window"])
+
+    # Infer FFN type
+    ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW)
+    result["ffn_type"] = ffn_type
+
+    # Set activation function. Start from the per-model fallback so the value
+    # is always defined, then normalize the explicit config field if present.
+    # quick_gelu is checked before the plain "gelu" substring so it is not
+    # collapsed to "gelu" (mirrors parse_vision_config).
+    activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW)
+    hf_activation = raw.get("hidden_act", raw.get("activation_function"))
+    if hf_activation:
+        act = hf_activation.lower()
+        if "quick_gelu" in act:
+            activation = "quick_gelu"
+        elif "silu" in act or "swish" in act:
+            activation = "silu"
+        elif "gelu" in act:
+            activation = "gelu"
+        elif "relu" in act:
+            activation = "relu"
+        else:
+            # Unrecognized name: keep the lowercased config value as-is
+            activation = act
+
+    if ffn_type == "mlp":
+        _set_nested(result, "mlp.activation", activation)
+        # Most modern models use gated activation (SwiGLU, GeGLU)
+        use_gated = model_type in (
+            "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3",
+            "gemma", "gemma2", "deepseek_v2", "deepseek_v3",
+        )
+        _set_nested(result, "mlp.use_gated_activation", use_gated)
+    elif ffn_type == "moe":
+        _set_nested(result, "moe.activation", activation)
+        # MoE-specific fields
+        if "moe_intermediate_size" in raw:
+            _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"])
+        if "num_shared_experts" in raw:
+            _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"])
+        if "shared_expert_intermediate_size" in raw:
+            _set_nested(
+                result, "moe.shared_expert_intermediate_size",
+                raw["shared_expert_intermediate_size"],
+            )
+        # DeepSeek MoE-specific fields (from PR #185 research)
+        if "routed_scaling_factor" in raw:
+            _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"])
+        if "topk_method" in raw:
+            _set_nested(result, "moe.topk_method", raw["topk_method"])
+        if "norm_topk_prob" in raw:
+            _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"])
+
+    # Mixed layers support (DeepSeek uses dense layers before switching to MoE)
+    if "first_k_dense_replace" in raw and "moe_layer_freq" in raw:
+        result["layer_structure"] = "mixed"
+        _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"])
+        _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"])
+
+    # Normalization: default to RMSNorm (most modern decoder LLMs); the GPT-2
+    # lineage plus GPT-NeoX and Falcon use classic LayerNorm.
+    norm_type = "rmsnorm"
+    if model_type in ("gpt2", "gpt_neo", "gpt_neox", "falcon"):
+        norm_type = "layernorm"
+    _set_nested(result, "norm.type", norm_type)
+
+    if "layer_norm_eps" in raw:
+        _set_nested(result, "norm.epsilon", raw["layer_norm_eps"])
+
+    # Tokenizer
+    _set_nested(result, "tokenizer.type", "bpe")
+    _set_nested(result, "tokenizer.library", "huggingface")
+
+    # Position embedding type
+    if model_type in ("gpt2", "gpt_neo"):
+        _set_nested(result, "position_embedding.type", "learned")
+    else:
+        _set_nested(result, "position_embedding.type", "rope")
+
+    # Embedding
+    tie_embeddings = raw.get("tie_word_embeddings", False)
+    _set_nested(result, "token_embedding.shared_embedding", tie_embeddings)
+
+    # Bias flags
+    attn_bias = raw.get("attention_bias", False)
+    _set_nested(result, "attention.has_qkv_bias", attn_bias)
+    _set_nested(result, "attention.has_output_bias", attn_bias)
+
+    mlp_bias = raw.get("mlp_bias", False)
+    if ffn_type == "mlp":
+        _set_nested(result, "mlp.has_bias", mlp_bias)
+
+    # Vision encoder (multimodal models)
+    vision = parse_vision_config(raw)
+    if vision:
+        result["vision_encoder"] = vision
+
+    return result
+
+
+# Maps HuggingFace vision config field names to ModelPack vision encoder paths.
+VISION_FIELD_MAP = { + "hidden_size": "hidden_size", + "patch_size": "patch_size", + "image_size": "image_size", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_attention_heads", + "intermediate_size": "intermediate_size", + "num_channels": "in_channels", + "in_chans": "in_channels", +} + +# Known vision model types +VISION_MODEL_TYPES = { + "llava", "llava_next", "llava_onevision", + "mllama", # LLaMA-3.2 Vision + "qwen2_vl", + "paligemma", "idefics2", "idefics3", +} + +# Known vision encoder type mapping +VISION_ENCODER_TYPE_MAP = { + "clip_vision_model": "clip_vit", + "siglip_vision_model": "clip_vit", + "CLIPVisionConfig": "clip_vit", + "SiglipVisionConfig": "clip_vit", +} + +# Known projector type mapping +PROJECTOR_TYPE_MAP = { + "llava": ("mlp", 2, "gelu"), + "llava_next": ("mlp", 2, "gelu"), + "llava_onevision": ("mlp", 2, "gelu"), + "mllama": ("cross_attention", None, None), + "paligemma": ("linear", 1, None), +} + +# Known fusion type mapping +FUSION_TYPE_MAP = { + "llava": "late", + "llava_next": "late", + "llava_onevision": "late", + "mllama": "cross_attention", + "qwen2_vl": "early", + "paligemma": "late", + "idefics2": "late", +} + + +def parse_vision_config(raw: dict) -> dict | None: + """Parse vision encoder fields from a HuggingFace multimodal config. + + Args: + raw: The parsed config.json dict from HuggingFace. + + Returns: + A dict following the ModelPack vision encoder spec, or None if not + a vision model. 
+ """ + model_type = raw.get("model_type", "").lower() + + # Extract the nested vision_config dict + vcfg = raw.get("vision_config") + if vcfg is None and model_type not in VISION_MODEL_TYPES: + return None + + # Some models embed vision config as a flat dict, others as nested + if isinstance(vcfg, dict): + vision_raw = vcfg + else: + return None + + result: dict = {} + + # Map static vision fields + for hf_key, mp_path in VISION_FIELD_MAP.items(): + if hf_key in vision_raw and vision_raw[hf_key] is not None: + _set_nested(result, mp_path, vision_raw[hf_key]) + + # Qwen2-VL uses different field names + if model_type == "qwen2_vl": + if "depth" in vision_raw: + result["num_layers"] = vision_raw["depth"] + if "embed_dim" in vision_raw: + result["hidden_size"] = vision_raw["embed_dim"] + if "num_heads" in vision_raw: + result["num_attention_heads"] = vision_raw["num_heads"] + if "spatial_patch_size" in vision_raw: + result["patch_size"] = vision_raw["spatial_patch_size"] + if "temporal_patch_size" in vision_raw: + result["temporal_patch_size"] = vision_raw["temporal_patch_size"] + if "spatial_merge_size" in vision_raw: + _set_nested(result, "dynamic_resolution.enabled", True) + _set_nested( + result, "dynamic_resolution.spatial_merge_size", + vision_raw["spatial_merge_size"], + ) + + # Infer encoder type + vision_model_type = vision_raw.get("model_type", "") + encoder_type = VISION_ENCODER_TYPE_MAP.get(vision_model_type) + if encoder_type is None: + # Check for CLIP-like configs + if "projection_dim" in vision_raw or vision_model_type in ("clip_vision_model",): + encoder_type = "clip_vit" + else: + encoder_type = "vit" + result["type"] = encoder_type + + # Activation + hf_act = vision_raw.get("hidden_act") + if hf_act: + act = hf_act.lower() + if "quick_gelu" in act: + result["activation"] = "quick_gelu" + elif "gelu" in act: + result["activation"] = "gelu" + elif "silu" in act or "swish" in act: + result["activation"] = "silu" + + # Normalization + eps = 
vision_raw.get("layer_norm_eps") + if eps is not None: + _set_nested(result, "norm.type", "layernorm") + _set_nested(result, "norm.epsilon", eps) + + # Projector + proj_info = PROJECTOR_TYPE_MAP.get(model_type) + if proj_info: + proj_type, proj_layers, proj_act = proj_info + _set_nested(result, "projector.type", proj_type) + if proj_layers is not None: + _set_nested(result, "projector.num_layers", proj_layers) + if proj_act is not None: + _set_nested(result, "projector.activation", proj_act) + # Check for explicit projector config + if "projector_hidden_act" in raw: + act = raw["projector_hidden_act"].lower() + _set_nested(result, "projector.activation", act) + + # Special tokens + token_fields = { + "image_token_id": "special_tokens.image_token_id", + "image_token_index": "special_tokens.image_token_id", + "vision_start_token_id": "special_tokens.vision_start_token_id", + "vision_end_token_id": "special_tokens.vision_end_token_id", + "vision_token_id": "special_tokens.vision_token_id", + "video_token_id": "special_tokens.video_token_id", + } + for hf_key, mp_path in token_fields.items(): + val = raw.get(hf_key) + if val is not None: + _set_nested(result, mp_path, val) + + # Fusion type + fusion = FUSION_TYPE_MAP.get(model_type, NEEDS_REVIEW) + result["fusion_type"] = fusion + + # Cross-attention layers count into projector.num_layers (LLaMA-3.2 Vision) + if model_type == "mllama": + cross_attn_layers = raw.get("cross_attention_layers") + if cross_attn_layers is not None: + _set_nested(result, "projector.num_layers", len(cross_attn_layers)) + + # Position embedding for vision encoder + if model_type == "qwen2_vl": + rope_scaling = raw.get("rope_scaling", {}) + if rope_scaling.get("type") == "mrope": + _set_nested(result, "position_embedding.type", "mrope") + sections = rope_scaling.get("mrope_section") + if sections: + _set_nested(result, "position_embedding.mrope_sections", sections) + else: + _set_nested(result, "position_embedding.type", "learned") + + return 
result + + +def format_yaml(spec: dict, indent: int = 0) -> str: + """Format a spec dict as YAML string.""" + lines = [] + prefix = " " * indent + for key, value in spec.items(): + if isinstance(value, dict): + lines.append(f"{prefix}{key}:") + lines.append(format_yaml(value, indent + 1)) + elif isinstance(value, bool): + lines.append(f"{prefix}{key}: {str(value).lower()}") + elif isinstance(value, str): + if value == NEEDS_REVIEW: + lines.append(f"{prefix}{key}: {value} # requires human review") + else: + lines.append(f'{prefix}{key}: "{value}"') + elif value is None: + lines.append(f"{prefix}{key}: null") + else: + lines.append(f"{prefix}{key}: {value}") + return "\n".join(lines) + + +def load_config(source: str) -> dict: + """Load a config.json from a file path or HuggingFace model ID. + + Args: + source: Either a local file path or a HuggingFace model ID. + + Returns: + The parsed config.json dict. + """ + path = Path(source) + if path.is_file(): + with path.open(encoding="utf-8") as f: + return json.load(f) + + # Try loading from HuggingFace Hub + try: + from huggingface_hub import hf_hub_download + + config_path = hf_hub_download(repo_id=source, filename="config.json") + with open(config_path, encoding="utf-8") as f: + return json.load(f) + except ImportError: + print( + "error: huggingface_hub not installed. 
" + "Install with: pip install huggingface_hub", + file=sys.stderr, + ) + sys.exit(1) + except Exception as e: + print(f"error: failed to load config from '{source}': {e}", file=sys.stderr) + sys.exit(1) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Parse HuggingFace config.json into ModelPack transformer spec", + ) + parser.add_argument( + "model", + help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) " + "or path to config.json", + ) + parser.add_argument( + "--format", + choices=["yaml", "json"], + default="yaml", + help="Output format (default: yaml)", + ) + + args = parser.parse_args() + + raw = load_config(args.model) + spec = parse_hf_config(raw) + + model_type = raw.get("model_type", "unknown") + model_name = raw.get("_name_or_path", args.model) + + if args.format == "json": + print(json.dumps(spec, indent=2)) + else: + print(f"# ModelPack Transformer Spec") + print(f"# Generated from: {model_name}") + print(f"# Model type: {model_type}") + print(f"# NOTE: Fields marked NEEDS_REVIEW require human verification") + print() + print(format_yaml(spec)) + + # Report coverage + needs_review = [] + _find_needs_review(spec, "", needs_review) + if needs_review: + print(f"\n# --- Fields requiring review ({len(needs_review)}) ---") + for field in needs_review: + print(f"# - {field}") + + return 0 + + +def _find_needs_review(d: dict, prefix: str, result: list) -> None: + """Recursively find all NEEDS_REVIEW fields.""" + for key, value in d.items(): + path = f"{prefix}.{key}" if prefix else key + if isinstance(value, dict): + _find_needs_review(value, path, result) + elif value == NEEDS_REVIEW: + result.append(path) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/hf_parser_test.py b/tools/hf_parser_test.py new file mode 100644 index 0000000..9d27255 --- /dev/null +++ b/tools/hf_parser_test.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +"""Tests for the HuggingFace config parser.""" + +from __future__ 
# NOTE(review): this token continues the statement split across the original
# physical-line boundary; the full statement is
# `from __future__ import annotations`.
import annotations

import pytest

from hf_parser import NEEDS_REVIEW, parse_hf_config, parse_vision_config


# Minimal config.json samples based on real HuggingFace models.

# Dense GQA decoder with sliding-window attention.
MISTRAL_7B_CONFIG = {
    "model_type": "mistral",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "rope_theta": 10000.0,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "sliding_window": 4096,
    "tie_word_embeddings": False,
    "attention_bias": False,
}

# MoE variant: adds num_local_experts / num_experts_per_tok.
MIXTRAL_8X7B_CONFIG = {
    "model_type": "mixtral",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "num_local_experts": 8,
    "num_experts_per_tok": 2,
    "tie_word_embeddings": False,
}

# GQA model with attention bias enabled.
QWEN2_7B_CONFIG = {
    "model_type": "qwen2",
    "vocab_size": 152064,
    "hidden_size": 3584,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "intermediate_size": 18944,
    "max_position_embeddings": 131072,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "tie_word_embeddings": False,
    "attention_bias": True,
    "sliding_window": 131072,
}

# Classic MHA model: LayerNorm, tied embeddings, non-gated GELU.
GPT2_CONFIG = {
    "model_type": "gpt2",
    "vocab_size": 50257,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "max_position_embeddings": 1024,
    "layer_norm_eps": 1e-5,
    "activation_function": "gelu_new",
    "tie_word_embeddings": True,
}

# MLA attention + fine-grained MoE (shared experts, mixed dense/MoE layers).
DEEPSEEK_V2_LITE_CONFIG = {
    "model_type": "deepseek_v2",
    "vocab_size": 102400,
    "hidden_size": 2048,
    "num_hidden_layers": 27,
    "num_attention_heads": 16,
    "num_key_value_heads": 16,
    "intermediate_size": 10944,
    "max_position_embeddings": 163840,
    "rope_theta":
        10000.0,  # (value continues "rope_theta" from the previous line)
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "kv_lora_rank": 512,
    "q_lora_rank": 1536,
    "qk_nope_head_dim": 128,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
    "n_routed_experts": 64,
    "num_experts_per_tok": 6,
    "first_k_dense_replace": 1,
    "moe_layer_freq": 1,
    "num_shared_experts": 2,
    "routed_scaling_factor": 1.0,
    "topk_method": "group_limited_greedy",
    "norm_topk_prob": False,
    "tie_word_embeddings": False,
}

# Unrecognized model_type: parser should flag uncertain fields for review.
UNKNOWN_CONFIG = {
    "model_type": "some_new_model",
    "vocab_size": 65536,
    "hidden_size": 2048,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
}


class TestMistral:
    """Dense GQA decoder: basic field mapping, attention, FFN, norm, RoPE."""

    def test_basic_fields(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        assert spec["vocabulary_size"] == 32000
        assert spec["hidden_size"] == 4096
        assert spec["num_layers"] == 32
        assert spec["type"] == "decoder"

    def test_attention(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        attn = spec["attention"]
        assert attn["type"] == "gqa"
        assert attn["num_attention_heads"] == 32
        assert attn["num_key_value_heads"] == 8
        assert attn["head_dim"] == 128  # 4096 / 32
        assert attn["is_causal"] is True
        assert attn["sliding_window"] == 4096

    def test_ffn(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        assert spec["ffn_type"] == "mlp"
        assert spec["mlp"]["intermediate_size"] == 14336
        assert spec["mlp"]["activation"] == "silu"
        assert spec["mlp"]["use_gated_activation"] is True

    def test_norm(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        assert spec["norm"]["type"] == "rmsnorm"
        assert spec["norm"]["epsilon"] == 1e-5

    def test_position_embedding(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        pe = spec["position_embedding"]
        assert pe["type"] == "rope"
        assert pe["rope_theta"] == 10000.0
        assert pe["max_position_embeddings"] == 32768


class TestMixtral:
    """MoE detection for Mixtral-style num_local_experts configs."""

    def test_moe_detection(self):
        spec = parse_hf_config(MIXTRAL_8X7B_CONFIG)
        assert spec["ffn_type"] == "moe"
        assert \
            spec["moe"]["num_experts"] == 8  # (completes the split assert)
        assert spec["moe"]["top_k"] == 2

    def test_attention_still_gqa(self):
        spec = parse_hf_config(MIXTRAL_8X7B_CONFIG)
        assert spec["attention"]["type"] == "gqa"
        assert spec["attention"]["num_key_value_heads"] == 8


class TestQwen2:
    """Attention-bias flags and RoPE theta propagation."""

    def test_attention_bias(self):
        spec = parse_hf_config(QWEN2_7B_CONFIG)
        assert spec["attention"]["has_qkv_bias"] is True
        assert spec["attention"]["has_output_bias"] is True

    def test_rope_theta(self):
        spec = parse_hf_config(QWEN2_7B_CONFIG)
        assert spec["position_embedding"]["rope_theta"] == 1000000.0


class TestGPT2:
    """MHA attention, LayerNorm, tied embeddings, non-gated GELU."""

    def test_mha_attention(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["attention"]["type"] == "mha"

    def test_layernorm(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["norm"]["type"] == "layernorm"
        assert spec["norm"]["epsilon"] == 1e-5

    def test_tied_embeddings(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["token_embedding"]["shared_embedding"] is True

    def test_no_gated_activation(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["mlp"]["use_gated_activation"] is False

    def test_gelu_activation(self):
        # "gelu_new" in the HF config is normalized to "gelu" by the parser.
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["mlp"]["activation"] == "gelu"


class TestDeepSeekV2:
    """MLA attention fields and fine-grained MoE routing."""

    def test_mla_attention(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["attention"]["type"] == "mla"

    def test_mla_fields(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        attn = spec["attention"]
        assert attn["kv_lora_rank"] == 512
        assert attn["q_lora_rank"] == 1536
        assert attn["qk_nope_head_dim"] == 128
        assert attn["qk_rope_head_dim"] == 64
        assert attn["v_head_dim"] == 128

    def test_moe_with_n_routed_experts(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["ffn_type"] == "moe"
        assert spec["moe"]["num_experts"] == 64
        assert spec["moe"]["top_k"] == 6

    def test_shared_experts(self):
        spec = \
            parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)  # (completes the split assignment)
        assert spec["moe"]["num_shared_experts"] == 2

    def test_moe_routing_fields(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["moe"]["routed_scaling_factor"] == 1.0
        assert spec["moe"]["topk_method"] == "group_limited_greedy"
        assert spec["moe"]["norm_topk_prob"] is False

    def test_mixed_layers(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["layer_structure"] == "mixed"
        assert spec["mixed_layers"]["first_k_dense_replace"] == 1
        assert spec["mixed_layers"]["moe_layer_freq"] == 1


class TestGPT2PositionEmbedding:
    """GPT-2 maps to learned absolute position embeddings."""

    def test_learned_position_embedding(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["position_embedding"]["type"] == "learned"


class TestUnknownModel:
    """Unknown model_type: flag uncertain fields, still parse static ones."""

    def test_needs_review_flags(self):
        spec = parse_hf_config(UNKNOWN_CONFIG)
        assert spec["attention"]["type"] == NEEDS_REVIEW
        assert spec["ffn_type"] == NEEDS_REVIEW

    def test_static_fields_still_parsed(self):
        spec = parse_hf_config(UNKNOWN_CONFIG)
        assert spec["vocabulary_size"] == 65536
        assert spec["hidden_size"] == 2048
        assert spec["num_layers"] == 24

    def test_head_dim_derived(self):
        spec = parse_hf_config(UNKNOWN_CONFIG)
        assert spec["attention"]["head_dim"] == 128  # 2048 / 16


class TestHeadDimDerivation:
    """head_dim: an explicit config value wins over hidden_size / num_heads."""

    def test_explicit_head_dim(self):
        config = {
            "model_type": "mistral",
            "vocab_size": 32000,
            "hidden_size": 4096,
            "num_attention_heads": 32,
            "head_dim": 64,  # explicit, not derived
        }
        spec = parse_hf_config(config)
        assert spec["attention"]["head_dim"] == 64  # uses explicit value

    def test_derived_head_dim(self):
        config = {
            "model_type": "llama",
            "vocab_size": 32000,
            "hidden_size": 4096,
            "num_attention_heads": 32,
        }
        spec = parse_hf_config(config)
        assert spec["attention"]["head_dim"] == 128  # 4096 / 32


# ============================================================================
# Vision model configs
#
# ============================================================================

# LLaVA-1.5: CLIP ViT tower + MLP projector (late fusion).
LLAVA_15_CONFIG = {
    "model_type": "llava",
    "vocab_size": 32064,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 32,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "image_token_index": 32000,
    "projector_hidden_act": "gelu",
    "vision_config": {
        "model_type": "clip_vision_model",
        "hidden_size": 1024,
        "patch_size": 14,
        "image_size": 336,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "num_channels": 3,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "projection_dim": 768,
    },
}

# Qwen2-VL: native dynamic resolution, temporal patches, M-RoPE,
# explicit vision special-token IDs.
QWEN2_VL_CONFIG = {
    "model_type": "qwen2_vl",
    "vocab_size": 152064,
    "hidden_size": 3584,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "vision_start_token_id": 151652,
    "vision_end_token_id": 151653,
    "vision_token_id": 151654,
    "image_token_id": 151655,
    "video_token_id": 151656,
    "rope_scaling": {
        "type": "mrope",
        "mrope_section": [16, 24, 24],
    },
    "vision_config": {
        "depth": 32,
        "embed_dim": 1280,
        "num_heads": 16,
        "in_chans": 3,
        "spatial_patch_size": 14,
        "spatial_merge_size": 2,
        "temporal_patch_size": 2,
        "hidden_act": "quick_gelu",
    },
}

# Llama 3.2 Vision (mllama): cross-attention fusion into listed layers.
LLAMA_32_VISION_CONFIG = {
    "model_type": "mllama",
    "vocab_size": 128256,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 131072,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "image_token_index": 128256,
    "cross_attention_layers": [3, 8, 13, 18, 23, 28, 33, 38],
    "vision_config": {
        "model_type": "clip_vision_model",
        "hidden_size": 1280,
        "patch_size": 14,
        "image_size":
            560,  # (value continues "image_size" from the previous line)
        "num_hidden_layers": 32,
        "num_attention_heads": 16,
        "intermediate_size": 5120,
        "num_channels": 3,
        "hidden_act": "gelu",
        "layer_norm_eps": 1e-5,
    },
}

# Text-only baseline: parser must not emit a vision_encoder section.
TEXT_ONLY_CONFIG = {
    "model_type": "llama",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
}


# ============================================================================
# Vision model tests
# ============================================================================

class TestLLaVA:
    """CLIP-ViT encoder extraction and MLP projector for LLaVA-1.5."""

    def test_vision_encoder_present(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert "vision_encoder" in spec

    def test_encoder_type(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["type"] == "clip_vit"

    def test_encoder_fields(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["hidden_size"] == 1024
        assert ve["patch_size"] == 14
        assert ve["image_size"] == 336
        assert ve["num_layers"] == 24
        assert ve["num_attention_heads"] == 16
        assert ve["intermediate_size"] == 4096
        assert ve["in_channels"] == 3

    def test_activation(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["activation"] == "quick_gelu"

    def test_norm(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["norm"]["type"] == "layernorm"
        assert ve["norm"]["epsilon"] == 1e-5

    def test_projector(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        proj = spec["vision_encoder"]["projector"]
        assert proj["type"] == "mlp"
        assert proj["num_layers"] == 2
        assert proj["activation"] == "gelu"

    def test_special_tokens(self):
        # image_token_index in the HF config maps to image_token_id.
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["special_tokens"]["image_token_id"] == 32000

    def test_fusion_type(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["fusion_type"] == "late"

    def test_position_embedding(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["position_embedding"]["type"] == "learned"


class TestQwen2VL:
    """Qwen2-VL: dynamic resolution, temporal patches, M-RoPE, special tokens."""

    def test_vision_encoder_present(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        assert "vision_encoder" in spec

    def test_qwen_specific_fields(self):
        # Qwen2-VL uses depth/embed_dim/num_heads/spatial_patch_size key names.
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["num_layers"] == 32
        assert ve["hidden_size"] == 1280
        assert ve["num_attention_heads"] == 16
        assert ve["patch_size"] == 14

    def test_temporal_patch(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        assert spec["vision_encoder"]["temporal_patch_size"] == 2

    def test_dynamic_resolution(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        dr = spec["vision_encoder"]["dynamic_resolution"]
        assert dr["enabled"] is True
        assert dr["spatial_merge_size"] == 2

    def test_special_tokens(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        tokens = spec["vision_encoder"]["special_tokens"]
        assert tokens["image_token_id"] == 151655
        assert tokens["vision_start_token_id"] == 151652
        assert tokens["vision_end_token_id"] == 151653
        assert tokens["vision_token_id"] == 151654
        assert tokens["video_token_id"] == 151656

    def test_mrope_position_embedding(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        pe = spec["vision_encoder"]["position_embedding"]
        assert pe["type"] == "mrope"
        assert pe["mrope_sections"] == [16, 24, 24]

    def test_fusion_type(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        assert spec["vision_encoder"]["fusion_type"] == "early"


class TestLLaMA32Vision:
    """Llama 3.2 Vision: encoder fields and cross-attention projector."""

    def test_vision_encoder_present(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert "vision_encoder" in spec

    def test_encoder_fields(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["hidden_size"] == 1280
        assert ve["patch_size"] == 14
        assert ve["image_size"] == 560
        assert ve["num_layers"] == 32
        assert ve["num_attention_heads"] == 16
    def test_cross_attention_projector(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        proj = spec["vision_encoder"]["projector"]
        assert proj["type"] == "cross_attention"

    def test_cross_attention_layers_count(self):
        # LLAMA_32_VISION_CONFIG lists 8 cross_attention_layers.
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert spec["vision_encoder"]["projector"]["num_layers"] == 8

    def test_fusion_type(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert spec["vision_encoder"]["fusion_type"] == "cross_attention"

    def test_special_tokens(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert spec["vision_encoder"]["special_tokens"]["image_token_id"] == 128256


class TestTextOnlyModel:
    """Text-only configs must not gain a vision_encoder."""

    def test_no_vision_encoder(self):
        spec = parse_hf_config(TEXT_ONLY_CONFIG)
        assert "vision_encoder" not in spec

    def test_no_vision_config_returns_none(self):
        result = parse_vision_config(TEXT_ONLY_CONFIG)
        assert result is None