diff --git a/tools/hf_parser.py b/tools/hf_parser.py
new file mode 100644
index 0000000..37301b8
--- /dev/null
+++ b/tools/hf_parser.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+"""Parse a HuggingFace model config.json into the ModelPack transformer spec format.
+
+This tool maps HuggingFace Transformers config.json fields to the ModelPack
+unified transformer specification vocabulary defined in PR #111
+(docs/architecture.md by @aftersnow).
+
+Usage:
+    python tools/hf_parser.py meta-llama/Meta-Llama-3-8B
+    python tools/hf_parser.py mistralai/Mistral-7B-v0.3
+    python tools/hf_parser.py path/to/config.json
+
+The output is a YAML spec, printed to stdout, following the ModelPack
+transformer spec format. Fields that cannot be reliably inferred from
+config.json are marked as NEEDS_REVIEW for human verification.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+NEEDS_REVIEW = "__NEEDS_REVIEW__"
+
+# Maps HuggingFace config.json field names to ModelPack transformer spec paths.
+# Based on PR #111's field vocabulary (docs/architecture.md).
+FIELD_MAP = {
+    # Top-level transformer fields
+    "vocab_size": "vocabulary_size",
+    "hidden_size": "hidden_size",
+    # Position embedding
+    "max_position_embeddings": "position_embedding.max_position_embeddings",
+    "rope_theta": "position_embedding.rope_theta",
+    "rope_scaling": "position_embedding.rope_scaling",
+    # Attention
+    "num_attention_heads": "attention.num_attention_heads",
+    "num_key_value_heads": "attention.num_key_value_heads",
+    "head_dim": "attention.head_dim",
+    # FFN / MLP
+    "intermediate_size": "mlp.intermediate_size",
+    # Transformer layers
+    "num_hidden_layers": "num_layers",
+    # Normalization
+    "rms_norm_eps": "norm.epsilon",
+    # MoE fields
+    "num_local_experts": "moe.num_experts",
+    "num_experts_per_tok": "moe.top_k",
+    "num_experts": "moe.num_experts",
+    "n_routed_experts": "moe.num_experts",  # DeepSeek naming variant
+    # MLA fields (DeepSeek)
+    "kv_lora_rank": "attention.kv_lora_rank",
+    "q_lora_rank": "attention.q_lora_rank",
+    "qk_nope_head_dim": "attention.qk_nope_head_dim",
+    "qk_rope_head_dim": "attention.qk_rope_head_dim",
+    "v_head_dim": "attention.v_head_dim",
+}
+
+# Known model type → attention type mapping
+ATTENTION_TYPE_MAP = {
+    "llama": "gqa",
+    "mistral": "gqa",
+    "mixtral": "gqa",
+    "qwen2": "gqa",
+    "qwen2_moe": "gqa",
+    "gemma": "gqa",
+    "gemma2": "gqa",
+    "phi3": "gqa",
+    "deepseek_v2": "mla",
+    "deepseek_v3": "mla",
+    "gpt2": "mha",
+    "gpt_neo": "mha",
+    "gpt_neox": "mha",
+    "falcon": "mha",
+}
+
+# Known model type → FFN type mapping
+FFN_TYPE_MAP = {
+    "llama": "mlp",
+    "mistral": "mlp",
+    "mixtral": "moe",
+    "qwen2": "mlp",
+    "qwen2_moe": "moe",
+    "gemma": "mlp",
+    "gemma2": "mlp",
+    "phi3": "mlp",
+    "deepseek_v2": "moe",
+    "deepseek_v3": "moe",
+    "gpt2": "mlp",
+    "gpt_neo": "mlp",
+    "gpt_neox": "mlp",
+    "falcon": "mlp",
+}
+
+# Known model type → activation function mapping
+ACTIVATION_MAP = {
+    "llama": "silu",
+    "mistral": "silu",
+    "mixtral": "silu",
+    "qwen2": "silu",
+    "qwen2_moe": "silu",
+    "gemma": "gelu",
+    "gemma2": "gelu",
+    "phi3": "silu",
+    "gpt2": "gelu",
+    "gpt_neo": "gelu",
+    "gpt_neox": "gelu",
+    "falcon": "gelu",
+}
+
+
+def _set_nested(d: dict, path: str, value) -> None:
+    """Set a value in a nested dict using a dot-separated path."""
+    keys = path.split(".")
+    for key in keys[:-1]:
+        d = d.setdefault(key, {})
+    d[keys[-1]] = value
+
+
+def _get_nested(d: dict, path: str, default=None):
+    """Get a value from a nested dict using a dot-separated path."""
+    keys = path.split(".")
+    for key in keys:
+        if not isinstance(d, dict) or key not in d:
+            return default
+        d = d[key]
+    return d
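+
+
+# Quick sketch of the helper semantics (illustrative; mirrors what the unit
+# tests in hf_parser_test.py assume): dot paths expand into nested dicts on
+# write and collapse back on read.
+#
+#     d: dict = {}
+#     _set_nested(d, "attention.head_dim", 128)
+#     assert d == {"attention": {"head_dim": 128}}
+#     assert _get_nested(d, "attention.head_dim") == 128
+#     assert _get_nested(d, "attention.missing", default=64) == 64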
path.""" + keys = path.split(".") + for key in keys: + if not isinstance(d, dict) or key not in d: + return default + d = d[key] + return d + + +def parse_hf_config(raw: dict) -> dict: + """Parse a HuggingFace config.json dict into ModelPack transformer spec. + + Args: + raw: The parsed config.json dict from HuggingFace. + + Returns: + A dict following the ModelPack transformer spec format. + """ + result: dict = {} + model_type = raw.get("model_type", "").lower() + + # Map static fields + for hf_key, mp_path in FIELD_MAP.items(): + if hf_key in raw and raw[hf_key] is not None: + _set_nested(result, mp_path, raw[hf_key]) + + # Derive head_dim if absent + if "attention" in result and "head_dim" not in result.get("attention", {}): + hidden = result.get("hidden_size") + n_heads = _get_nested(result, "attention.num_attention_heads") + if hidden and n_heads: + _set_nested(result, "attention.head_dim", hidden // n_heads) + + # Set architecture type + result["type"] = "decoder" + result["architecture_version"] = "0.1.0" + + # Infer attention type from model_type + attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW) + _set_nested(result, "attention.type", attn_type) + _set_nested(result, "attention.is_causal", True) + + # Check for sliding window attention + if raw.get("sliding_window") is not None: + _set_nested(result, "attention.sliding_window", raw["sliding_window"]) + + # Infer FFN type + ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW) + result["ffn_type"] = ffn_type + + # Set activation function + hf_activation = raw.get("hidden_act", raw.get("activation_function")) + if hf_activation: + activation = hf_activation.lower() + if "silu" in activation or "swish" in activation: + activation = "silu" + elif "gelu" in activation: + activation = "gelu" + elif "relu" in activation: + activation = "relu" + else: + activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW) + + if ffn_type == "mlp": + _set_nested(result, "mlp.activation", activation) + # Most modern models use gated activation (SwiGLU, GeGLU) + use_gated = model_type in ( + "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3", + "gemma", "gemma2", "deepseek_v2", "deepseek_v3", + ) + _set_nested(result, "mlp.use_gated_activation", use_gated) + elif ffn_type == "moe": + _set_nested(result, "moe.activation", activation) + # MoE-specific fields + if "moe_intermediate_size" in raw: + _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"]) + if "num_shared_experts" in raw: + _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"]) + if "shared_expert_intermediate_size" in raw: + _set_nested( + result, "moe.shared_expert_intermediate_size", + raw["shared_expert_intermediate_size"], + ) + # DeepSeek MoE-specific fields (from PR #185 research) + if "routed_scaling_factor" in raw: + _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"]) + if "topk_method" in raw: + _set_nested(result, "moe.topk_method", raw["topk_method"]) + if "norm_topk_prob" in raw: + _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"]) + + # Mixed layers support (DeepSeek uses dense layers before switching to MoE) + if "first_k_dense_replace" in raw and "moe_layer_freq" in raw: + result["layer_structure"] = "mixed" + _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"]) + _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"]) + + # Normalization + norm_type = "rmsnorm" # Most modern models use RMSNorm + if model_type in ("gpt2", 
"gpt_neo"): + norm_type = "layernorm" + _set_nested(result, "norm.type", norm_type) + + if "layer_norm_eps" in raw: + _set_nested(result, "norm.epsilon", raw["layer_norm_eps"]) + + # Tokenizer + _set_nested(result, "tokenizer.type", "bpe") + _set_nested(result, "tokenizer.library", "huggingface") + + # Position embedding type + if model_type in ("gpt2", "gpt_neo"): + _set_nested(result, "position_embedding.type", "learned") + else: + _set_nested(result, "position_embedding.type", "rope") + + # Embedding + tie_embeddings = raw.get("tie_word_embeddings", False) + _set_nested(result, "token_embedding.shared_embedding", tie_embeddings) + + # Bias flags + attn_bias = raw.get("attention_bias", False) + _set_nested(result, "attention.has_qkv_bias", attn_bias) + _set_nested(result, "attention.has_output_bias", attn_bias) + + mlp_bias = raw.get("mlp_bias", False) + if ffn_type == "mlp": + _set_nested(result, "mlp.has_bias", mlp_bias) + + return result + + +def format_yaml(spec: dict, indent: int = 0) -> str: + """Format a spec dict as YAML string.""" + lines = [] + prefix = " " * indent + for key, value in spec.items(): + if isinstance(value, dict): + lines.append(f"{prefix}{key}:") + lines.append(format_yaml(value, indent + 1)) + elif isinstance(value, bool): + lines.append(f"{prefix}{key}: {str(value).lower()}") + elif isinstance(value, str): + if value == NEEDS_REVIEW: + lines.append(f"{prefix}{key}: {value} # requires human review") + else: + lines.append(f'{prefix}{key}: "{value}"') + elif value is None: + lines.append(f"{prefix}{key}: null") + else: + lines.append(f"{prefix}{key}: {value}") + return "\n".join(lines) + + +def load_config(source: str) -> dict: + """Load a config.json from a file path or HuggingFace model ID. + + Args: + source: Either a local file path or a HuggingFace model ID. + + Returns: + The parsed config.json dict. + """ + path = Path(source) + if path.is_file(): + with path.open(encoding="utf-8") as f: + return json.load(f) + + # Try loading from HuggingFace Hub + try: + from huggingface_hub import hf_hub_download + + config_path = hf_hub_download(repo_id=source, filename="config.json") + with open(config_path, encoding="utf-8") as f: + return json.load(f) + except ImportError: + print( + "error: huggingface_hub not installed. 
" + "Install with: pip install huggingface_hub", + file=sys.stderr, + ) + sys.exit(1) + except Exception as e: + print(f"error: failed to load config from '{source}': {e}", file=sys.stderr) + sys.exit(1) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Parse HuggingFace config.json into ModelPack transformer spec", + ) + parser.add_argument( + "model", + help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) " + "or path to config.json", + ) + parser.add_argument( + "--format", + choices=["yaml", "json"], + default="yaml", + help="Output format (default: yaml)", + ) + + args = parser.parse_args() + + raw = load_config(args.model) + spec = parse_hf_config(raw) + + model_type = raw.get("model_type", "unknown") + model_name = raw.get("_name_or_path", args.model) + + if args.format == "json": + print(json.dumps(spec, indent=2)) + else: + print(f"# ModelPack Transformer Spec") + print(f"# Generated from: {model_name}") + print(f"# Model type: {model_type}") + print(f"# NOTE: Fields marked NEEDS_REVIEW require human verification") + print() + print(format_yaml(spec)) + + # Report coverage + needs_review = [] + _find_needs_review(spec, "", needs_review) + if needs_review: + print(f"\n# --- Fields requiring review ({len(needs_review)}) ---") + for field in needs_review: + print(f"# - {field}") + + return 0 + + +def _find_needs_review(d: dict, prefix: str, result: list) -> None: + """Recursively find all NEEDS_REVIEW fields.""" + for key, value in d.items(): + path = f"{prefix}.{key}" if prefix else key + if isinstance(value, dict): + _find_needs_review(value, path, result) + elif value == NEEDS_REVIEW: + result.append(path) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/hf_parser_test.py b/tools/hf_parser_test.py new file mode 100644 index 0000000..75227fc --- /dev/null +++ b/tools/hf_parser_test.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +"""Tests for the HuggingFace config parser.""" + +from __future__ import annotations + +import pytest + +from hf_parser import NEEDS_REVIEW, parse_hf_config + + +# Minimal config.json samples based on real HuggingFace models. 
+MISTRAL_7B_CONFIG = {
+    "model_type": "mistral",
+    "vocab_size": 32000,
+    "hidden_size": 4096,
+    "num_hidden_layers": 32,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 8,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 32768,
+    "rope_theta": 10000.0,
+    "rms_norm_eps": 1e-5,
+    "hidden_act": "silu",
+    "sliding_window": 4096,
+    "tie_word_embeddings": False,
+    "attention_bias": False,
+}
+
+MIXTRAL_8X7B_CONFIG = {
+    "model_type": "mixtral",
+    "vocab_size": 32000,
+    "hidden_size": 4096,
+    "num_hidden_layers": 32,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 8,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 32768,
+    "rope_theta": 1000000.0,
+    "rms_norm_eps": 1e-5,
+    "hidden_act": "silu",
+    "num_local_experts": 8,
+    "num_experts_per_tok": 2,
+    "tie_word_embeddings": False,
+}
+
+QWEN2_7B_CONFIG = {
+    "model_type": "qwen2",
+    "vocab_size": 152064,
+    "hidden_size": 3584,
+    "num_hidden_layers": 28,
+    "num_attention_heads": 28,
+    "num_key_value_heads": 4,
+    "intermediate_size": 18944,
+    "max_position_embeddings": 131072,
+    "rope_theta": 1000000.0,
+    "rms_norm_eps": 1e-6,
+    "hidden_act": "silu",
+    "tie_word_embeddings": False,
+    "attention_bias": True,
+    "sliding_window": 131072,
+}
+
+GPT2_CONFIG = {
+    "model_type": "gpt2",
+    "vocab_size": 50257,
+    "hidden_size": 768,
+    "num_hidden_layers": 12,
+    "num_attention_heads": 12,
+    "intermediate_size": 3072,
+    "max_position_embeddings": 1024,
+    "layer_norm_eps": 1e-5,
+    "activation_function": "gelu_new",
+    "tie_word_embeddings": True,
+}
+
+DEEPSEEK_V2_LITE_CONFIG = {
+    "model_type": "deepseek_v2",
+    "vocab_size": 102400,
+    "hidden_size": 2048,
+    "num_hidden_layers": 27,
+    "num_attention_heads": 16,
+    "num_key_value_heads": 16,
+    "intermediate_size": 10944,
+    "max_position_embeddings": 163840,
+    "rope_theta": 10000.0,
+    "rms_norm_eps": 1e-6,
+    "hidden_act": "silu",
+    "kv_lora_rank": 512,
+    "q_lora_rank": 1536,
+    "qk_nope_head_dim": 128,
+    "qk_rope_head_dim": 64,
+    "v_head_dim": 128,
+    "n_routed_experts": 64,
+    "num_experts_per_tok": 6,
+    "first_k_dense_replace": 1,
+    "moe_layer_freq": 1,
+    "num_shared_experts": 2,
+    "routed_scaling_factor": 1.0,
+    "topk_method": "group_limited_greedy",
+    "norm_topk_prob": False,
+    "tie_word_embeddings": False,
+}
+
+UNKNOWN_CONFIG = {
+    "model_type": "some_new_model",
+    "vocab_size": 65536,
+    "hidden_size": 2048,
+    "num_hidden_layers": 24,
+    "num_attention_heads": 16,
+}
+
+
+class TestMistral:
+    def test_basic_fields(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["vocabulary_size"] == 32000
+        assert spec["hidden_size"] == 4096
+        assert spec["num_layers"] == 32
+        assert spec["type"] == "decoder"
+
+    def test_attention(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        attn = spec["attention"]
+        assert attn["type"] == "gqa"
+        assert attn["num_attention_heads"] == 32
+        assert attn["num_key_value_heads"] == 8
+        assert attn["head_dim"] == 128  # 4096 / 32
+        assert attn["is_causal"] is True
+        assert attn["sliding_window"] == 4096
+
+    def test_ffn(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["ffn_type"] == "mlp"
+        assert spec["mlp"]["intermediate_size"] == 14336
+        assert spec["mlp"]["activation"] == "silu"
+        assert spec["mlp"]["use_gated_activation"] is True
+
+    def test_norm(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["norm"]["type"] == "rmsnorm"
+        assert spec["norm"]["epsilon"] == 1e-5
+
+    def test_position_embedding(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        pe = spec["position_embedding"]
+        assert pe["type"] == "rope"
+        assert pe["rope_theta"] == 10000.0
+        assert pe["max_position_embeddings"] == 32768
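+
+
+# A small additional check (a sketch beyond the original suite): the parser
+# unconditionally stamps tokenizer defaults, which downstream consumers may
+# rely on.
+class TestTokenizerDefaults:
+    def test_tokenizer_defaults(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["tokenizer"]["type"] == "bpe"
+        assert spec["tokenizer"]["library"] == "huggingface"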
"rope" + assert pe["rope_theta"] == 10000.0 + assert pe["max_position_embeddings"] == 32768 + + +class TestMixtral: + def test_moe_detection(self): + spec = parse_hf_config(MIXTRAL_8X7B_CONFIG) + assert spec["ffn_type"] == "moe" + assert spec["moe"]["num_experts"] == 8 + assert spec["moe"]["top_k"] == 2 + + def test_attention_still_gqa(self): + spec = parse_hf_config(MIXTRAL_8X7B_CONFIG) + assert spec["attention"]["type"] == "gqa" + assert spec["attention"]["num_key_value_heads"] == 8 + + +class TestQwen2: + def test_attention_bias(self): + spec = parse_hf_config(QWEN2_7B_CONFIG) + assert spec["attention"]["has_qkv_bias"] is True + assert spec["attention"]["has_output_bias"] is True + + def test_rope_theta(self): + spec = parse_hf_config(QWEN2_7B_CONFIG) + assert spec["position_embedding"]["rope_theta"] == 1000000.0 + + +class TestGPT2: + def test_mha_attention(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["attention"]["type"] == "mha" + + def test_layernorm(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["norm"]["type"] == "layernorm" + assert spec["norm"]["epsilon"] == 1e-5 + + def test_tied_embeddings(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["token_embedding"]["shared_embedding"] is True + + def test_no_gated_activation(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["mlp"]["use_gated_activation"] is False + + def test_gelu_activation(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["mlp"]["activation"] == "gelu" + + +class TestDeepSeekV2: + def test_mla_attention(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["attention"]["type"] == "mla" + + def test_mla_fields(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + attn = spec["attention"] + assert attn["kv_lora_rank"] == 512 + assert attn["q_lora_rank"] == 1536 + assert attn["qk_nope_head_dim"] == 128 + assert attn["qk_rope_head_dim"] == 64 + assert attn["v_head_dim"] == 128 + + def test_moe_with_n_routed_experts(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["ffn_type"] == "moe" + assert spec["moe"]["num_experts"] == 64 + assert spec["moe"]["top_k"] == 6 + + def test_shared_experts(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["moe"]["num_shared_experts"] == 2 + + def test_moe_routing_fields(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["moe"]["routed_scaling_factor"] == 1.0 + assert spec["moe"]["topk_method"] == "group_limited_greedy" + assert spec["moe"]["norm_topk_prob"] is False + + def test_mixed_layers(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["layer_structure"] == "mixed" + assert spec["mixed_layers"]["first_k_dense_replace"] == 1 + assert spec["mixed_layers"]["moe_layer_freq"] == 1 + + +class TestGPT2PositionEmbedding: + def test_learned_position_embedding(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["position_embedding"]["type"] == "learned" + + +class TestUnknownModel: + def test_needs_review_flags(self): + spec = parse_hf_config(UNKNOWN_CONFIG) + assert spec["attention"]["type"] == NEEDS_REVIEW + assert spec["ffn_type"] == NEEDS_REVIEW + + def test_static_fields_still_parsed(self): + spec = parse_hf_config(UNKNOWN_CONFIG) + assert spec["vocabulary_size"] == 65536 + assert spec["hidden_size"] == 2048 + assert spec["num_layers"] == 24 + + def test_head_dim_derived(self): + spec = parse_hf_config(UNKNOWN_CONFIG) + assert spec["attention"]["head_dim"] == 128 # 2048 / 16 + + +class TestHeadDimDerivation: + def 
+
+
+class TestHeadDimDerivation:
+    def test_explicit_head_dim(self):
+        config = {
+            "model_type": "mistral",
+            "vocab_size": 32000,
+            "hidden_size": 4096,
+            "num_attention_heads": 32,
+            "head_dim": 64,  # explicit, not derived
+        }
+        spec = parse_hf_config(config)
+        assert spec["attention"]["head_dim"] == 64  # uses explicit value
+
+    def test_derived_head_dim(self):
+        config = {
+            "model_type": "llama",
+            "vocab_size": 32000,
+            "hidden_size": 4096,
+            "num_attention_heads": 32,
+        }
+        spec = parse_hf_config(config)
+        assert spec["attention"]["head_dim"] == 128  # 4096 / 32
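+
+
+# Sketch (beyond the original suite): spot-check format_yaml's rendering rules
+# (lowercase booleans, quoted strings, NEEDS_REVIEW annotation). Imported
+# locally to keep the module's top-level imports unchanged.
+class TestFormatYaml:
+    def test_needs_review_annotation(self):
+        from hf_parser import format_yaml
+
+        out = format_yaml({"ffn_type": NEEDS_REVIEW})
+        assert "requires human review" in out
+
+    def test_strings_quoted_and_bools_lowercase(self):
+        from hf_parser import format_yaml
+
+        out = format_yaml({"norm": {"type": "rmsnorm"}, "tied": True})
+        assert 'type: "rmsnorm"' in out
+        assert "tied: true" in out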