diff --git a/docs/schemas/vision-encoder-schema.json b/docs/schemas/vision-encoder-schema.json
new file mode 100644
index 0000000..6c76417
--- /dev/null
+++ b/docs/schemas/vision-encoder-schema.json
@@ -0,0 +1,167 @@
+{
+  "description": "Vision Encoder Architecture Schema for Multimodal Models",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://github.com/modelpack/model-spec/vision-encoder",
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "enum": ["vit", "clip_vit", "other"],
+      "description": "The vision encoder architecture type"
+    },
+    "hidden_size": {
+      "type": "integer",
+      "description": "Hidden size / embedding dimension of the vision encoder"
+    },
+    "patch_size": {
+      "type": "integer",
+      "description": "Spatial patch size in pixels (e.g., 14 means 14x14 patches)"
+    },
+    "image_size": {
+      "type": "integer",
+      "description": "Default input image resolution in pixels"
+    },
+    "num_layers": {
+      "type": "integer",
+      "description": "Number of transformer layers in the vision encoder"
+    },
+    "num_attention_heads": {
+      "type": "integer",
+      "description": "Number of attention heads in the vision encoder"
+    },
+    "intermediate_size": {
+      "type": "integer",
+      "description": "FFN intermediate size in the vision encoder"
+    },
+    "in_channels": {
+      "type": "integer",
+      "description": "Number of input image channels (3 for RGB)",
+      "default": 3
+    },
+    "activation": {
+      "type": "string",
+      "description": "Activation function used in the vision encoder (e.g., quick_gelu, gelu, silu)"
+    },
+    "norm": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": ["layernorm", "rmsnorm"],
+          "description": "Normalization type in the vision encoder"
+        },
+        "epsilon": {
+          "type": "number",
+          "description": "Epsilon value for normalization"
+        }
+      },
+      "additionalProperties": false
+    },
+    "projector": {
+      "type": "object",
+      "description": "Multimodal projector that maps vision embeddings to language model space",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": ["mlp", "linear", "cross_attention", "perceiver", "other"],
+          "description": "Projector architecture type"
+        },
+        "num_layers": {
+          "type": "integer",
+          "description": "Number of layers in the projector (for MLP or cross-attention projectors)"
+        },
+        "activation": {
+          "type": "string",
+          "description": "Activation function in the projector (e.g., gelu)"
+        }
+      },
+      "additionalProperties": false
+    },
+    "special_tokens": {
+      "type": "object",
+      "description": "Special token IDs for image/video in the tokenizer",
+      "properties": {
+        "image_token_id": {
+          "type": "integer",
+          "description": "Token ID used as a placeholder for image input"
+        },
+        "vision_start_token_id": {
+          "type": "integer",
+          "description": "Token ID marking the start of a vision region"
+        },
+        "vision_end_token_id": {
+          "type": "integer",
+          "description": "Token ID marking the end of a vision region"
+        },
+        "vision_token_id": {
+          "type": "integer",
+          "description": "Token ID for a generic vision placeholder (e.g., used by Qwen2-VL)"
+        },
+        "video_token_id": {
+          "type": "integer",
+          "description": "Token ID for video frame placeholder"
+        }
+      },
+      "additionalProperties": false
+    },
+    "dynamic_resolution": {
+      "type": "object",
+      "description": "Dynamic image resolution support (e.g., Qwen2-VL native dynamic resolution)",
+      "properties": {
+        "enabled": {
+          "type": "boolean"
+        },
+        "min_pixels": {
+          "type": "integer",
+          "description": "Minimum number of image pixels after resizing (lower bound used to control the visual token count)"
+        },
+        "max_pixels": {
+          "type": "integer",
+          "description": "Maximum number of image pixels after resizing (upper bound used to control the visual token count)"
+        },
+        "spatial_merge_size": {
+          "type": "integer",
+          "description": "Spatial merging stride for reducing token count"
+        }
+      },
+      "additionalProperties": false
+    },
+    "temporal_patch_size": {
+      "type": "integer",
+      "description": "Temporal patch size for video understanding (number of frames per patch)"
+    },
+    "fusion_type": {
+      "type": "string",
+      "enum": ["early", "late",
"cross_attention"], + "description": "How vision and language modalities are fused" + }, + "position_embedding": { + "type": "object", + "description": "Position embedding configuration for the vision encoder", + "properties": { + "type": { + "type": "string", + "enum": ["learned", "rope", "mrope", "sinusoidal"], + "description": "Type of position embedding" + }, + "mrope_sections": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Per-modality RoPE dimension sections (for mrope type)" + } + }, + "additionalProperties": false + } + }, + "required": [ + "type", + "hidden_size", + "patch_size", + "image_size", + "num_layers", + "num_attention_heads" + ], + "additionalProperties": false +} diff --git a/docs/vision-encoder.md b/docs/vision-encoder.md new file mode 100644 index 0000000..e05d3c0 --- /dev/null +++ b/docs/vision-encoder.md @@ -0,0 +1,210 @@ +# Vision Encoder Specification + +This document describes the vision encoder architecture fields for multimodal models that process image and video inputs. It extends the model configuration defined in [config.md](./config.md) to cover the architectural details of how visual inputs are processed. + +## Background + +The current ModelPack specification supports declaring image modality via `capabilities.inputTypes: ["image"]`, but provides no architectural description of how images are processed. Every major model family now has a vision variant (LLaVA, Qwen2-VL, LLaMA-3.2 Vision, Gemma 2 VL), and inference engines need structured metadata about the vision encoder to correctly configure image preprocessing, patch embedding, and vision-language fusion. + +## Architecture Overview + +Vision-language models follow a common pattern: + +```text +Input Image → Vision Encoder → Projector → Language Model → Text Output + ↓ + Visual token embeddings +``` + +The **vision encoder** converts raw images into a sequence of visual tokens using a Vision Transformer (ViT) or CLIP-ViT architecture. 
A **projector** module maps these visual tokens into the language model's embedding space. The **fusion type** determines how visual and textual tokens interact inside the language model. + +## Properties + +- **type** _string_, REQUIRED + + The vision encoder architecture type. Supported values: + + | Value | Description | + |-------|-------------| + | `"vit"` | Standard Vision Transformer | + | `"clip_vit"` | CLIP-pretrained Vision Transformer | + | `"other"` | Other vision encoder architecture | + +- **hidden_size** _integer_, REQUIRED + + The hidden size (embedding dimension) of the vision encoder. + +- **patch_size** _integer_, REQUIRED + + The spatial patch size in pixels. For example, `14` means the image is divided into 14×14 pixel patches. Each patch becomes one visual token. + +- **image_size** _integer_, REQUIRED + + The default input image resolution in pixels. + +- **num_layers** _integer_, REQUIRED + + The number of transformer layers in the vision encoder. + +- **num_attention_heads** _integer_, REQUIRED + + The number of attention heads in the vision encoder. + +- **intermediate_size** _integer_, OPTIONAL + + The FFN intermediate size in the vision encoder. + +- **in_channels** _integer_, OPTIONAL + + The number of input image channels. Defaults to `3` (RGB). + +- **activation** _string_, OPTIONAL + + The activation function used in the vision encoder, such as `"quick_gelu"`, `"gelu"`, or `"silu"`. + +- **norm** _object_, OPTIONAL + + Normalization configuration for the vision encoder. + + - **type** _string_, OPTIONAL + + The normalization type. Supported values: `"layernorm"`, `"rmsnorm"`. + + - **epsilon** _number_, OPTIONAL + + The epsilon value for normalization. + +- **projector** _object_, OPTIONAL + + The multimodal projector that maps vision encoder outputs to the language model embedding space. + + - **type** _string_, OPTIONAL + + The projector architecture type. 
Supported values: + + | Value | Description | + |-------|-------------| + | `"mlp"` | Multi-layer perceptron (e.g., LLaVA 1.5 uses 2-layer MLP with GELU) | + | `"linear"` | Single linear projection | + | `"cross_attention"` | Cross-attention layers (e.g., LLaMA-3.2 Vision) | + | `"perceiver"` | Perceiver-style resampler | + | `"other"` | Other projector architecture | + + - **num_layers** _integer_, OPTIONAL + + The number of layers in the projector (for MLP or cross-attention type projectors). + + - **activation** _string_, OPTIONAL + + The activation function in the projector, such as `"gelu"`. + +- **special_tokens** _object_, OPTIONAL + + Special token IDs used for image and video inputs in the tokenizer. + + - **image_token_id** _integer_, OPTIONAL + + The token ID used as a placeholder for image input in the text sequence. + + - **vision_start_token_id** _integer_, OPTIONAL + + The token ID marking the start of a vision region (used by models like Qwen2-VL). + + - **vision_end_token_id** _integer_, OPTIONAL + + The token ID marking the end of a vision region. + + - **vision_token_id** _integer_, OPTIONAL + + The token ID for a generic vision placeholder (used by models like Qwen2-VL). + + - **video_token_id** _integer_, OPTIONAL + + The token ID for video frame placeholders. + +- **dynamic_resolution** _object_, OPTIONAL + + Dynamic image resolution support, where the model can handle variable-resolution inputs. + + - **enabled** _boolean_, OPTIONAL + + Whether dynamic resolution is enabled. + + - **min_pixels** _integer_, OPTIONAL + + The minimum number of visual tokens. + + - **max_pixels** _integer_, OPTIONAL + + The maximum number of visual tokens. + + - **spatial_merge_size** _integer_, OPTIONAL + + The spatial merging stride for reducing visual token count. + +- **temporal_patch_size** _integer_, OPTIONAL + + The temporal patch size for video understanding. Specifies how many frames are grouped into one temporal patch. 
+ +- **fusion_type** _string_, OPTIONAL + + How vision and language modalities are fused. Supported values: + + | Value | Description | + |-------|-------------| + | `"early"` | Visual tokens are concatenated with text tokens before the first transformer layer (e.g., Qwen2-VL) | + | `"late"` | Visual tokens are injected after separate encoding (e.g., LLaVA) | + | `"cross_attention"` | Dedicated cross-attention layers between vision and language (e.g., LLaMA-3.2 Vision) | + +- **position_embedding** _object_, OPTIONAL + + Position embedding configuration for the vision encoder. + + - **type** _string_, OPTIONAL + + The type of position embedding. Supported values: `"learned"`, `"rope"`, `"mrope"`, `"sinusoidal"`. + + - **mrope_sections** _array of integers_, OPTIONAL + + Per-modality RoPE dimension sections. Only applicable when type is `"mrope"` (e.g., Qwen2-VL uses `[16, 24, 24]` for temporal, height, width dimensions). + +## Model Coverage + +| Model | Encoder | Patch Size | Image Size | Projector | Fusion | Special Features | +|-------|---------|-----------|------------|-----------|--------|------------------| +| LLaVA 1.5 | CLIP-ViT-L/14 | 14 | 336 | 2-layer MLP | late | — | +| Qwen2-VL | ViT | 14 | dynamic | — | early | mRoPE, dynamic resolution, video | +| LLaMA-3.2 Vision | CLIP-ViT | 14 | 560 | cross-attention | cross_attention | Gated cross-attention | +| Gemma 2 VL | SigLIP | 14 | 224 | linear | late | — | + +## Example + +```json +{ + "type": "clip_vit", + "hidden_size": 1024, + "patch_size": 14, + "image_size": 336, + "num_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "in_channels": 3, + "activation": "quick_gelu", + "norm": { + "type": "layernorm", + "epsilon": 1e-5 + }, + "projector": { + "type": "mlp", + "num_layers": 2, + "activation": "gelu" + }, + "special_tokens": { + "image_token_id": 32000 + }, + "fusion_type": "late", + "position_embedding": { + "type": "learned" + } +} +``` diff --git a/tools/hf_parser.py 
b/tools/hf_parser.py new file mode 100644 index 0000000..2ef8ae9 --- /dev/null +++ b/tools/hf_parser.py @@ -0,0 +1,548 @@ +#!/usr/bin/env python3 +"""Parse HuggingFace model config.json into ModelPack transformer spec format. + +This tool maps HuggingFace Transformers config.json fields to the ModelPack +unified transformer specification vocabulary defined in PR #111 +(docs/architecture.md by @aftersnow). + +Usage: + python tools/hf_parser.py meta-llama/Meta-Llama-3-8B + python tools/hf_parser.py mistralai/Mistral-7B-v0.3 + python tools/hf_parser.py --file path/to/config.json + +The output is a YAML spec file following the ModelPack transformer spec format. +Fields that cannot be reliably inferred from config.json are marked as +NEEDS_REVIEW for human verification. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +NEEDS_REVIEW = "__NEEDS_REVIEW__" + +# Maps HuggingFace config.json field names to ModelPack transformer spec paths. +# Based on PR #111's field vocabulary (docs/architecture.md). 
+FIELD_MAP = {
+    # Top-level transformer fields
+    "vocab_size": "vocabulary_size",
+    "hidden_size": "hidden_size",
+    # GPT-2 lineage naming variants: real GPT-2 / GPT-Neo configs use the
+    # n_* names rather than hidden_size/num_hidden_layers, so map them too.
+    "n_embd": "hidden_size",
+    "n_layer": "num_layers",
+    "n_head": "attention.num_attention_heads",
+    "n_inner": "mlp.intermediate_size",
+    "n_positions": "position_embedding.max_position_embeddings",
+    # Position embedding
+    "max_position_embeddings": "position_embedding.max_position_embeddings",
+    "rope_theta": "position_embedding.rope_theta",
+    "rope_scaling": "position_embedding.rope_scaling",
+    # Attention
+    "num_attention_heads": "attention.num_attention_heads",
+    "num_key_value_heads": "attention.num_key_value_heads",
+    "head_dim": "attention.head_dim",
+    # FFN / MLP
+    "intermediate_size": "mlp.intermediate_size",
+    # Transformer layers
+    "num_hidden_layers": "num_layers",
+    # Normalization
+    "rms_norm_eps": "norm.epsilon",
+    # MoE fields
+    "num_local_experts": "moe.num_experts",
+    "num_experts_per_tok": "moe.top_k",
+    "num_experts": "moe.num_experts",
+    "n_routed_experts": "moe.num_experts",  # DeepSeek naming variant
+    # MLA fields (DeepSeek)
+    "kv_lora_rank": "attention.kv_lora_rank",
+    "q_lora_rank": "attention.q_lora_rank",
+    "qk_nope_head_dim": "attention.qk_nope_head_dim",
+    "qk_rope_head_dim": "attention.qk_rope_head_dim",
+    "v_head_dim": "attention.v_head_dim",
+}
+
+# Known model type → attention type mapping
+ATTENTION_TYPE_MAP = {
+    "llama": "gqa",
+    "mistral": "gqa",
+    "mixtral": "gqa",
+    "qwen2": "gqa",
+    "qwen2_moe": "gqa",
+    "gemma": "gqa",
+    "gemma2": "gqa",
+    "phi3": "gqa",
+    "deepseek_v2": "mla",
+    "deepseek_v3": "mla",
+    "gpt2": "mha",
+    "gpt_neo": "mha",
+    "gpt_neox": "mha",
+    "falcon": "mha",
+}
+
+# Known model type → FFN type mapping
+FFN_TYPE_MAP = {
+    "llama": "mlp",
+    "mistral": "mlp",
+    "mixtral": "moe",
+    "qwen2": "mlp",
+    "qwen2_moe": "moe",
+    "gemma": "mlp",
+    "gemma2": "mlp",
+    "phi3": "mlp",
+    "deepseek_v2": "moe",
+    "deepseek_v3": "moe",
+    "gpt2": "mlp",
+    "gpt_neo": "mlp",
+    "gpt_neox": "mlp",
+    "falcon": "mlp",
+}
+
+# Known model type → activation function mapping (fallback when the config
+# carries no explicit activation field).
+ACTIVATION_MAP = {
+    "llama": "silu",
+    "mistral": "silu",
+    "mixtral": "silu",
+    "qwen2": "silu",
+    "qwen2_moe": "silu",
+    "gemma": "gelu",
+    "gemma2": "gelu",
+    "phi3": "silu",
+    "gpt2": "gelu",
+    "gpt_neo": "gelu",
+    "gpt_neox": "gelu",
+    "falcon": "gelu",
+}
+
+
+def _set_nested(d: dict, path: str, value) -> None:
+    """Set a value in a nested dict using dot-separated path."""
+    keys = path.split(".")
+    for key in keys[:-1]:
+        d = d.setdefault(key, {})
+    d[keys[-1]] = value
+
+
+def _get_nested(d: dict, path: str, default=None):
+    """Get a value from a nested dict using dot-separated path."""
+    keys = path.split(".")
+    for key in keys:
+        if not isinstance(d, dict) or key not in d:
+            return default
+        d = d[key]
+    return d
+
+
+def parse_hf_config(raw: dict) -> dict:
+    """Parse a HuggingFace config.json dict into ModelPack transformer spec.
+
+    Args:
+        raw: The parsed config.json dict from HuggingFace.
+
+    Returns:
+        A dict following the ModelPack transformer spec format.
+    """
+    result: dict = {}
+    model_type = raw.get("model_type", "").lower()
+
+    # Map static fields (None values are treated as absent)
+    for hf_key, mp_path in FIELD_MAP.items():
+        if hf_key in raw and raw[hf_key] is not None:
+            _set_nested(result, mp_path, raw[hf_key])
+
+    # Derive head_dim if absent
+    if "attention" in result and "head_dim" not in result.get("attention", {}):
+        hidden = result.get("hidden_size")
+        n_heads = _get_nested(result, "attention.num_attention_heads")
+        if hidden and n_heads:
+            _set_nested(result, "attention.head_dim", hidden // n_heads)
+
+    # Set architecture type
+    result["type"] = "decoder"
+    result["architecture_version"] = "0.1.0"
+
+    # Infer attention type from model_type
+    attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW)
+    _set_nested(result, "attention.type", attn_type)
+    _set_nested(result, "attention.is_causal", True)
+
+    # Check for sliding window attention
+    if raw.get("sliding_window") is not None:
+        _set_nested(result, "attention.sliding_window", raw["sliding_window"])
+
+    # Infer FFN type
+    ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW)
+    result["ffn_type"] = ffn_type
+
+    # Set activation function. Start from the per-model fallback so the value
+    # is always defined, then normalize the explicit config field if present.
+    # quick_gelu is checked before the plain "gelu" substring so it is not
+    # collapsed to "gelu" (mirrors parse_vision_config).
+    activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW)
+    hf_activation = raw.get("hidden_act", raw.get("activation_function"))
+    if hf_activation:
+        act = hf_activation.lower()
+        if "quick_gelu" in act:
+            activation = "quick_gelu"
+        elif "silu" in act or "swish" in act:
+            activation = "silu"
+        elif "gelu" in act:
+            activation = "gelu"
+        elif "relu" in act:
+            activation = "relu"
+        else:
+            # Unrecognized name: keep the lowercased config value as-is
+            activation = act
+
+    if ffn_type == "mlp":
+        _set_nested(result, "mlp.activation", activation)
+        # Most modern models use gated activation (SwiGLU, GeGLU)
+        use_gated = model_type in (
+            "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3",
+            "gemma", "gemma2", "deepseek_v2", "deepseek_v3",
+        )
+        _set_nested(result, "mlp.use_gated_activation", use_gated)
+    elif ffn_type == "moe":
+        _set_nested(result, "moe.activation", activation)
+        # MoE-specific fields
+        if "moe_intermediate_size" in raw:
+            _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"])
+        if "num_shared_experts" in raw:
+            _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"])
+        if "shared_expert_intermediate_size" in raw:
+            _set_nested(
+                result, "moe.shared_expert_intermediate_size",
+                raw["shared_expert_intermediate_size"],
+            )
+        # DeepSeek MoE-specific fields (from PR #185 research)
+        if "routed_scaling_factor" in raw:
+            _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"])
+        if "topk_method" in raw:
+            _set_nested(result, "moe.topk_method", raw["topk_method"])
+        if "norm_topk_prob" in raw:
+            _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"])
+
+    # Mixed layers support (DeepSeek uses dense layers before switching to MoE)
+    if "first_k_dense_replace" in raw and "moe_layer_freq" in raw:
+        result["layer_structure"] = "mixed"
+        _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"])
+        _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"])
+
+    # Normalization: default to RMSNorm (most modern decoder LLMs); the GPT-2
+    # lineage plus GPT-NeoX and Falcon use classic LayerNorm.
+    norm_type = "rmsnorm"
+    if model_type in ("gpt2", "gpt_neo", "gpt_neox", "falcon"):
+        norm_type = "layernorm"
+    _set_nested(result, "norm.type", norm_type)
+
+    if "layer_norm_eps" in raw:
+        _set_nested(result, "norm.epsilon", raw["layer_norm_eps"])
+
+    # Tokenizer
+    _set_nested(result, "tokenizer.type", "bpe")
+    _set_nested(result, "tokenizer.library", "huggingface")
+
+    # Position embedding type
+    if model_type in ("gpt2", "gpt_neo"):
+        _set_nested(result, "position_embedding.type", "learned")
+    else:
+        _set_nested(result, "position_embedding.type", "rope")
+
+    # Embedding
+    tie_embeddings = raw.get("tie_word_embeddings", False)
+    _set_nested(result, "token_embedding.shared_embedding", tie_embeddings)
+
+    # Bias flags
+    attn_bias = raw.get("attention_bias", False)
+    _set_nested(result, "attention.has_qkv_bias", attn_bias)
+    _set_nested(result, "attention.has_output_bias", attn_bias)
+
+    mlp_bias = raw.get("mlp_bias", False)
+    if ffn_type == "mlp":
+        _set_nested(result, "mlp.has_bias", mlp_bias)
+
+    # Vision encoder (multimodal models)
+    vision = parse_vision_config(raw)
+    if vision:
+        result["vision_encoder"] = vision
+
+    return result
+
+
+# Maps HuggingFace vision config field names to ModelPack vision encoder paths.
+VISION_FIELD_MAP = { + "hidden_size": "hidden_size", + "patch_size": "patch_size", + "image_size": "image_size", + "num_hidden_layers": "num_layers", + "num_attention_heads": "num_attention_heads", + "intermediate_size": "intermediate_size", + "num_channels": "in_channels", + "in_chans": "in_channels", +} + +# Known vision model types +VISION_MODEL_TYPES = { + "llava", "llava_next", "llava_onevision", + "mllama", # LLaMA-3.2 Vision + "qwen2_vl", + "paligemma", "idefics2", "idefics3", +} + +# Known vision encoder type mapping +VISION_ENCODER_TYPE_MAP = { + "clip_vision_model": "clip_vit", + "siglip_vision_model": "clip_vit", + "CLIPVisionConfig": "clip_vit", + "SiglipVisionConfig": "clip_vit", +} + +# Known projector type mapping +PROJECTOR_TYPE_MAP = { + "llava": ("mlp", 2, "gelu"), + "llava_next": ("mlp", 2, "gelu"), + "llava_onevision": ("mlp", 2, "gelu"), + "mllama": ("cross_attention", None, None), + "paligemma": ("linear", 1, None), +} + +# Known fusion type mapping +FUSION_TYPE_MAP = { + "llava": "late", + "llava_next": "late", + "llava_onevision": "late", + "mllama": "cross_attention", + "qwen2_vl": "early", + "paligemma": "late", + "idefics2": "late", +} + + +def parse_vision_config(raw: dict) -> dict | None: + """Parse vision encoder fields from a HuggingFace multimodal config. + + Args: + raw: The parsed config.json dict from HuggingFace. + + Returns: + A dict following the ModelPack vision encoder spec, or None if not + a vision model. 
+ """ + model_type = raw.get("model_type", "").lower() + + # Extract the nested vision_config dict + vcfg = raw.get("vision_config") + if vcfg is None and model_type not in VISION_MODEL_TYPES: + return None + + # Some models embed vision config as a flat dict, others as nested + if isinstance(vcfg, dict): + vision_raw = vcfg + else: + return None + + result: dict = {} + + # Map static vision fields + for hf_key, mp_path in VISION_FIELD_MAP.items(): + if hf_key in vision_raw and vision_raw[hf_key] is not None: + _set_nested(result, mp_path, vision_raw[hf_key]) + + # Qwen2-VL uses different field names + if model_type == "qwen2_vl": + if "depth" in vision_raw: + result["num_layers"] = vision_raw["depth"] + if "embed_dim" in vision_raw: + result["hidden_size"] = vision_raw["embed_dim"] + if "num_heads" in vision_raw: + result["num_attention_heads"] = vision_raw["num_heads"] + if "spatial_patch_size" in vision_raw: + result["patch_size"] = vision_raw["spatial_patch_size"] + if "temporal_patch_size" in vision_raw: + result["temporal_patch_size"] = vision_raw["temporal_patch_size"] + if "spatial_merge_size" in vision_raw: + _set_nested(result, "dynamic_resolution.enabled", True) + _set_nested( + result, "dynamic_resolution.spatial_merge_size", + vision_raw["spatial_merge_size"], + ) + + # Infer encoder type + vision_model_type = vision_raw.get("model_type", "") + encoder_type = VISION_ENCODER_TYPE_MAP.get(vision_model_type) + if encoder_type is None: + # Check for CLIP-like configs + if "projection_dim" in vision_raw or vision_model_type in ("clip_vision_model",): + encoder_type = "clip_vit" + else: + encoder_type = "vit" + result["type"] = encoder_type + + # Activation + hf_act = vision_raw.get("hidden_act") + if hf_act: + act = hf_act.lower() + if "quick_gelu" in act: + result["activation"] = "quick_gelu" + elif "gelu" in act: + result["activation"] = "gelu" + elif "silu" in act or "swish" in act: + result["activation"] = "silu" + + # Normalization + eps = 
vision_raw.get("layer_norm_eps") + if eps is not None: + _set_nested(result, "norm.type", "layernorm") + _set_nested(result, "norm.epsilon", eps) + + # Projector + proj_info = PROJECTOR_TYPE_MAP.get(model_type) + if proj_info: + proj_type, proj_layers, proj_act = proj_info + _set_nested(result, "projector.type", proj_type) + if proj_layers is not None: + _set_nested(result, "projector.num_layers", proj_layers) + if proj_act is not None: + _set_nested(result, "projector.activation", proj_act) + # Check for explicit projector config + if "projector_hidden_act" in raw: + act = raw["projector_hidden_act"].lower() + _set_nested(result, "projector.activation", act) + + # Special tokens + token_fields = { + "image_token_id": "special_tokens.image_token_id", + "image_token_index": "special_tokens.image_token_id", + "vision_start_token_id": "special_tokens.vision_start_token_id", + "vision_end_token_id": "special_tokens.vision_end_token_id", + "vision_token_id": "special_tokens.vision_token_id", + "video_token_id": "special_tokens.video_token_id", + } + for hf_key, mp_path in token_fields.items(): + val = raw.get(hf_key) + if val is not None: + _set_nested(result, mp_path, val) + + # Fusion type + fusion = FUSION_TYPE_MAP.get(model_type, NEEDS_REVIEW) + result["fusion_type"] = fusion + + # Cross-attention layers count into projector.num_layers (LLaMA-3.2 Vision) + if model_type == "mllama": + cross_attn_layers = raw.get("cross_attention_layers") + if cross_attn_layers is not None: + _set_nested(result, "projector.num_layers", len(cross_attn_layers)) + + # Position embedding for vision encoder + if model_type == "qwen2_vl": + rope_scaling = raw.get("rope_scaling", {}) + if rope_scaling.get("type") == "mrope": + _set_nested(result, "position_embedding.type", "mrope") + sections = rope_scaling.get("mrope_section") + if sections: + _set_nested(result, "position_embedding.mrope_sections", sections) + else: + _set_nested(result, "position_embedding.type", "learned") + + return 
result + + +def format_yaml(spec: dict, indent: int = 0) -> str: + """Format a spec dict as YAML string.""" + lines = [] + prefix = " " * indent + for key, value in spec.items(): + if isinstance(value, dict): + lines.append(f"{prefix}{key}:") + lines.append(format_yaml(value, indent + 1)) + elif isinstance(value, bool): + lines.append(f"{prefix}{key}: {str(value).lower()}") + elif isinstance(value, str): + if value == NEEDS_REVIEW: + lines.append(f"{prefix}{key}: {value} # requires human review") + else: + lines.append(f'{prefix}{key}: "{value}"') + elif value is None: + lines.append(f"{prefix}{key}: null") + else: + lines.append(f"{prefix}{key}: {value}") + return "\n".join(lines) + + +def load_config(source: str) -> dict: + """Load a config.json from a file path or HuggingFace model ID. + + Args: + source: Either a local file path or a HuggingFace model ID. + + Returns: + The parsed config.json dict. + """ + path = Path(source) + if path.is_file(): + with path.open(encoding="utf-8") as f: + return json.load(f) + + # Try loading from HuggingFace Hub + try: + from huggingface_hub import hf_hub_download + + config_path = hf_hub_download(repo_id=source, filename="config.json") + with open(config_path, encoding="utf-8") as f: + return json.load(f) + except ImportError: + print( + "error: huggingface_hub not installed. 
" + "Install with: pip install huggingface_hub", + file=sys.stderr, + ) + sys.exit(1) + except Exception as e: + print(f"error: failed to load config from '{source}': {e}", file=sys.stderr) + sys.exit(1) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Parse HuggingFace config.json into ModelPack transformer spec", + ) + parser.add_argument( + "model", + help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) " + "or path to config.json", + ) + parser.add_argument( + "--format", + choices=["yaml", "json"], + default="yaml", + help="Output format (default: yaml)", + ) + + args = parser.parse_args() + + raw = load_config(args.model) + spec = parse_hf_config(raw) + + model_type = raw.get("model_type", "unknown") + model_name = raw.get("_name_or_path", args.model) + + if args.format == "json": + print(json.dumps(spec, indent=2)) + else: + print(f"# ModelPack Transformer Spec") + print(f"# Generated from: {model_name}") + print(f"# Model type: {model_type}") + print(f"# NOTE: Fields marked NEEDS_REVIEW require human verification") + print() + print(format_yaml(spec)) + + # Report coverage + needs_review = [] + _find_needs_review(spec, "", needs_review) + if needs_review: + print(f"\n# --- Fields requiring review ({len(needs_review)}) ---") + for field in needs_review: + print(f"# - {field}") + + return 0 + + +def _find_needs_review(d: dict, prefix: str, result: list) -> None: + """Recursively find all NEEDS_REVIEW fields.""" + for key, value in d.items(): + path = f"{prefix}.{key}" if prefix else key + if isinstance(value, dict): + _find_needs_review(value, path, result) + elif value == NEEDS_REVIEW: + result.append(path) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/hf_parser_test.py b/tools/hf_parser_test.py new file mode 100644 index 0000000..9d27255 --- /dev/null +++ b/tools/hf_parser_test.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +"""Tests for the HuggingFace config parser.""" + +from __future__ 
# NOTE(review): this token continues the statement split across the original
# physical-line boundary; the full statement is
# `from __future__ import annotations`.
import annotations

import pytest

from hf_parser import NEEDS_REVIEW, parse_hf_config, parse_vision_config


# Minimal config.json samples based on real HuggingFace models.

# Dense GQA decoder with sliding-window attention.
MISTRAL_7B_CONFIG = {
    "model_type": "mistral",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "rope_theta": 10000.0,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "sliding_window": 4096,
    "tie_word_embeddings": False,
    "attention_bias": False,
}

# MoE variant: adds num_local_experts / num_experts_per_tok.
MIXTRAL_8X7B_CONFIG = {
    "model_type": "mixtral",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "num_local_experts": 8,
    "num_experts_per_tok": 2,
    "tie_word_embeddings": False,
}

# GQA model with attention bias enabled.
QWEN2_7B_CONFIG = {
    "model_type": "qwen2",
    "vocab_size": 152064,
    "hidden_size": 3584,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "intermediate_size": 18944,
    "max_position_embeddings": 131072,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "tie_word_embeddings": False,
    "attention_bias": True,
    "sliding_window": 131072,
}

# Classic MHA model: LayerNorm, tied embeddings, non-gated GELU.
GPT2_CONFIG = {
    "model_type": "gpt2",
    "vocab_size": 50257,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "max_position_embeddings": 1024,
    "layer_norm_eps": 1e-5,
    "activation_function": "gelu_new",
    "tie_word_embeddings": True,
}

# MLA attention + fine-grained MoE (shared experts, mixed dense/MoE layers).
DEEPSEEK_V2_LITE_CONFIG = {
    "model_type": "deepseek_v2",
    "vocab_size": 102400,
    "hidden_size": 2048,
    "num_hidden_layers": 27,
    "num_attention_heads": 16,
    "num_key_value_heads": 16,
    "intermediate_size": 10944,
    "max_position_embeddings": 163840,
    "rope_theta":
        10000.0,  # (value continues "rope_theta" from the previous line)
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "kv_lora_rank": 512,
    "q_lora_rank": 1536,
    "qk_nope_head_dim": 128,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
    "n_routed_experts": 64,
    "num_experts_per_tok": 6,
    "first_k_dense_replace": 1,
    "moe_layer_freq": 1,
    "num_shared_experts": 2,
    "routed_scaling_factor": 1.0,
    "topk_method": "group_limited_greedy",
    "norm_topk_prob": False,
    "tie_word_embeddings": False,
}

# Unrecognized model_type: parser should flag uncertain fields for review.
UNKNOWN_CONFIG = {
    "model_type": "some_new_model",
    "vocab_size": 65536,
    "hidden_size": 2048,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
}


class TestMistral:
    """Dense GQA decoder: basic field mapping, attention, FFN, norm, RoPE."""

    def test_basic_fields(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        assert spec["vocabulary_size"] == 32000
        assert spec["hidden_size"] == 4096
        assert spec["num_layers"] == 32
        assert spec["type"] == "decoder"

    def test_attention(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        attn = spec["attention"]
        assert attn["type"] == "gqa"
        assert attn["num_attention_heads"] == 32
        assert attn["num_key_value_heads"] == 8
        assert attn["head_dim"] == 128  # 4096 / 32
        assert attn["is_causal"] is True
        assert attn["sliding_window"] == 4096

    def test_ffn(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        assert spec["ffn_type"] == "mlp"
        assert spec["mlp"]["intermediate_size"] == 14336
        assert spec["mlp"]["activation"] == "silu"
        assert spec["mlp"]["use_gated_activation"] is True

    def test_norm(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        assert spec["norm"]["type"] == "rmsnorm"
        assert spec["norm"]["epsilon"] == 1e-5

    def test_position_embedding(self):
        spec = parse_hf_config(MISTRAL_7B_CONFIG)
        pe = spec["position_embedding"]
        assert pe["type"] == "rope"
        assert pe["rope_theta"] == 10000.0
        assert pe["max_position_embeddings"] == 32768


class TestMixtral:
    """MoE detection for Mixtral-style num_local_experts configs."""

    def test_moe_detection(self):
        spec = parse_hf_config(MIXTRAL_8X7B_CONFIG)
        assert spec["ffn_type"] == "moe"
        assert \
            spec["moe"]["num_experts"] == 8  # (completes the split assert)
        assert spec["moe"]["top_k"] == 2

    def test_attention_still_gqa(self):
        spec = parse_hf_config(MIXTRAL_8X7B_CONFIG)
        assert spec["attention"]["type"] == "gqa"
        assert spec["attention"]["num_key_value_heads"] == 8


class TestQwen2:
    """Attention-bias flags and RoPE theta propagation."""

    def test_attention_bias(self):
        spec = parse_hf_config(QWEN2_7B_CONFIG)
        assert spec["attention"]["has_qkv_bias"] is True
        assert spec["attention"]["has_output_bias"] is True

    def test_rope_theta(self):
        spec = parse_hf_config(QWEN2_7B_CONFIG)
        assert spec["position_embedding"]["rope_theta"] == 1000000.0


class TestGPT2:
    """MHA attention, LayerNorm, tied embeddings, non-gated GELU."""

    def test_mha_attention(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["attention"]["type"] == "mha"

    def test_layernorm(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["norm"]["type"] == "layernorm"
        assert spec["norm"]["epsilon"] == 1e-5

    def test_tied_embeddings(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["token_embedding"]["shared_embedding"] is True

    def test_no_gated_activation(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["mlp"]["use_gated_activation"] is False

    def test_gelu_activation(self):
        # "gelu_new" in the HF config is normalized to "gelu" by the parser.
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["mlp"]["activation"] == "gelu"


class TestDeepSeekV2:
    """MLA attention fields and fine-grained MoE routing."""

    def test_mla_attention(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["attention"]["type"] == "mla"

    def test_mla_fields(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        attn = spec["attention"]
        assert attn["kv_lora_rank"] == 512
        assert attn["q_lora_rank"] == 1536
        assert attn["qk_nope_head_dim"] == 128
        assert attn["qk_rope_head_dim"] == 64
        assert attn["v_head_dim"] == 128

    def test_moe_with_n_routed_experts(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["ffn_type"] == "moe"
        assert spec["moe"]["num_experts"] == 64
        assert spec["moe"]["top_k"] == 6

    def test_shared_experts(self):
        spec = \
            parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)  # (completes the split assignment)
        assert spec["moe"]["num_shared_experts"] == 2

    def test_moe_routing_fields(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["moe"]["routed_scaling_factor"] == 1.0
        assert spec["moe"]["topk_method"] == "group_limited_greedy"
        assert spec["moe"]["norm_topk_prob"] is False

    def test_mixed_layers(self):
        spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG)
        assert spec["layer_structure"] == "mixed"
        assert spec["mixed_layers"]["first_k_dense_replace"] == 1
        assert spec["mixed_layers"]["moe_layer_freq"] == 1


class TestGPT2PositionEmbedding:
    """GPT-2 maps to learned absolute position embeddings."""

    def test_learned_position_embedding(self):
        spec = parse_hf_config(GPT2_CONFIG)
        assert spec["position_embedding"]["type"] == "learned"


class TestUnknownModel:
    """Unknown model_type: flag uncertain fields, still parse static ones."""

    def test_needs_review_flags(self):
        spec = parse_hf_config(UNKNOWN_CONFIG)
        assert spec["attention"]["type"] == NEEDS_REVIEW
        assert spec["ffn_type"] == NEEDS_REVIEW

    def test_static_fields_still_parsed(self):
        spec = parse_hf_config(UNKNOWN_CONFIG)
        assert spec["vocabulary_size"] == 65536
        assert spec["hidden_size"] == 2048
        assert spec["num_layers"] == 24

    def test_head_dim_derived(self):
        spec = parse_hf_config(UNKNOWN_CONFIG)
        assert spec["attention"]["head_dim"] == 128  # 2048 / 16


class TestHeadDimDerivation:
    """head_dim: an explicit config value wins over hidden_size / num_heads."""

    def test_explicit_head_dim(self):
        config = {
            "model_type": "mistral",
            "vocab_size": 32000,
            "hidden_size": 4096,
            "num_attention_heads": 32,
            "head_dim": 64,  # explicit, not derived
        }
        spec = parse_hf_config(config)
        assert spec["attention"]["head_dim"] == 64  # uses explicit value

    def test_derived_head_dim(self):
        config = {
            "model_type": "llama",
            "vocab_size": 32000,
            "hidden_size": 4096,
            "num_attention_heads": 32,
        }
        spec = parse_hf_config(config)
        assert spec["attention"]["head_dim"] == 128  # 4096 / 32


# ============================================================================
# Vision model configs
#
# ============================================================================

# LLaVA-1.5: CLIP ViT tower + MLP projector (late fusion).
LLAVA_15_CONFIG = {
    "model_type": "llava",
    "vocab_size": 32064,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 32,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "image_token_index": 32000,
    "projector_hidden_act": "gelu",
    "vision_config": {
        "model_type": "clip_vision_model",
        "hidden_size": 1024,
        "patch_size": 14,
        "image_size": 336,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "num_channels": 3,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "projection_dim": 768,
    },
}

# Qwen2-VL: native dynamic resolution, temporal patches, M-RoPE,
# explicit vision special-token IDs.
QWEN2_VL_CONFIG = {
    "model_type": "qwen2_vl",
    "vocab_size": 152064,
    "hidden_size": 3584,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "vision_start_token_id": 151652,
    "vision_end_token_id": 151653,
    "vision_token_id": 151654,
    "image_token_id": 151655,
    "video_token_id": 151656,
    "rope_scaling": {
        "type": "mrope",
        "mrope_section": [16, 24, 24],
    },
    "vision_config": {
        "depth": 32,
        "embed_dim": 1280,
        "num_heads": 16,
        "in_chans": 3,
        "spatial_patch_size": 14,
        "spatial_merge_size": 2,
        "temporal_patch_size": 2,
        "hidden_act": "quick_gelu",
    },
}

# Llama 3.2 Vision (mllama): cross-attention fusion into listed layers.
LLAMA_32_VISION_CONFIG = {
    "model_type": "mllama",
    "vocab_size": 128256,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "intermediate_size": 14336,
    "max_position_embeddings": 131072,
    "rms_norm_eps": 1e-5,
    "hidden_act": "silu",
    "image_token_index": 128256,
    "cross_attention_layers": [3, 8, 13, 18, 23, 28, 33, 38],
    "vision_config": {
        "model_type": "clip_vision_model",
        "hidden_size": 1280,
        "patch_size": 14,
        "image_size":
            560,  # (value continues "image_size" from the previous line)
        "num_hidden_layers": 32,
        "num_attention_heads": 16,
        "intermediate_size": 5120,
        "num_channels": 3,
        "hidden_act": "gelu",
        "layer_norm_eps": 1e-5,
    },
}

# Text-only baseline: parser must not emit a vision_encoder section.
TEXT_ONLY_CONFIG = {
    "model_type": "llama",
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "num_attention_heads": 32,
}


# ============================================================================
# Vision model tests
# ============================================================================

class TestLLaVA:
    """CLIP-ViT encoder extraction and MLP projector for LLaVA-1.5."""

    def test_vision_encoder_present(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert "vision_encoder" in spec

    def test_encoder_type(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["type"] == "clip_vit"

    def test_encoder_fields(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["hidden_size"] == 1024
        assert ve["patch_size"] == 14
        assert ve["image_size"] == 336
        assert ve["num_layers"] == 24
        assert ve["num_attention_heads"] == 16
        assert ve["intermediate_size"] == 4096
        assert ve["in_channels"] == 3

    def test_activation(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["activation"] == "quick_gelu"

    def test_norm(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["norm"]["type"] == "layernorm"
        assert ve["norm"]["epsilon"] == 1e-5

    def test_projector(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        proj = spec["vision_encoder"]["projector"]
        assert proj["type"] == "mlp"
        assert proj["num_layers"] == 2
        assert proj["activation"] == "gelu"

    def test_special_tokens(self):
        # image_token_index in the HF config maps to image_token_id.
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["special_tokens"]["image_token_id"] == 32000

    def test_fusion_type(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["fusion_type"] == "late"

    def test_position_embedding(self):
        spec = parse_hf_config(LLAVA_15_CONFIG)
        assert spec["vision_encoder"]["position_embedding"]["type"] == "learned"


class TestQwen2VL:
    """Qwen2-VL: dynamic resolution, temporal patches, M-RoPE, special tokens."""

    def test_vision_encoder_present(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        assert "vision_encoder" in spec

    def test_qwen_specific_fields(self):
        # Qwen2-VL uses depth/embed_dim/num_heads/spatial_patch_size key names.
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["num_layers"] == 32
        assert ve["hidden_size"] == 1280
        assert ve["num_attention_heads"] == 16
        assert ve["patch_size"] == 14

    def test_temporal_patch(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        assert spec["vision_encoder"]["temporal_patch_size"] == 2

    def test_dynamic_resolution(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        dr = spec["vision_encoder"]["dynamic_resolution"]
        assert dr["enabled"] is True
        assert dr["spatial_merge_size"] == 2

    def test_special_tokens(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        tokens = spec["vision_encoder"]["special_tokens"]
        assert tokens["image_token_id"] == 151655
        assert tokens["vision_start_token_id"] == 151652
        assert tokens["vision_end_token_id"] == 151653
        assert tokens["vision_token_id"] == 151654
        assert tokens["video_token_id"] == 151656

    def test_mrope_position_embedding(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        pe = spec["vision_encoder"]["position_embedding"]
        assert pe["type"] == "mrope"
        assert pe["mrope_sections"] == [16, 24, 24]

    def test_fusion_type(self):
        spec = parse_hf_config(QWEN2_VL_CONFIG)
        assert spec["vision_encoder"]["fusion_type"] == "early"


class TestLLaMA32Vision:
    """Llama 3.2 Vision: encoder fields and cross-attention projector."""

    def test_vision_encoder_present(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert "vision_encoder" in spec

    def test_encoder_fields(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        ve = spec["vision_encoder"]
        assert ve["hidden_size"] == 1280
        assert ve["patch_size"] == 14
        assert ve["image_size"] == 560
        assert ve["num_layers"] == 32
        assert ve["num_attention_heads"] == 16
    def test_cross_attention_projector(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        proj = spec["vision_encoder"]["projector"]
        assert proj["type"] == "cross_attention"

    def test_cross_attention_layers_count(self):
        # LLAMA_32_VISION_CONFIG lists 8 cross_attention_layers.
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert spec["vision_encoder"]["projector"]["num_layers"] == 8

    def test_fusion_type(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert spec["vision_encoder"]["fusion_type"] == "cross_attention"

    def test_special_tokens(self):
        spec = parse_hf_config(LLAMA_32_VISION_CONFIG)
        assert spec["vision_encoder"]["special_tokens"]["image_token_id"] == 128256


class TestTextOnlyModel:
    """Text-only configs must not gain a vision_encoder."""

    def test_no_vision_encoder(self):
        spec = parse_hf_config(TEXT_ONLY_CONFIG)
        assert "vision_encoder" not in spec

    def test_no_vision_config_returns_none(self):
        result = parse_vision_config(TEXT_ONLY_CONFIG)
        assert result is None