diff --git a/tools/hf_parser.py b/tools/hf_parser.py
new file mode 100644
index 0000000..37301b8
--- /dev/null
+++ b/tools/hf_parser.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+"""Parse a HuggingFace model config.json into the ModelPack transformer spec format.
+
+This tool maps HuggingFace Transformers config.json fields to the ModelPack
+unified transformer specification vocabulary defined in PR #111
+(docs/architecture.md by @aftersnow).
+
+Usage:
+    python tools/hf_parser.py meta-llama/Meta-Llama-3-8B
+    python tools/hf_parser.py mistralai/Mistral-7B-v0.3
+    python tools/hf_parser.py path/to/config.json
+
+The output is a YAML spec, printed to stdout, following the ModelPack
+transformer spec format. Fields that cannot be reliably inferred from
+config.json are marked as NEEDS_REVIEW for human verification.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+NEEDS_REVIEW = "__NEEDS_REVIEW__"
+
+# Maps HuggingFace config.json field names to ModelPack transformer spec paths.
+# Based on PR #111's field vocabulary (docs/architecture.md).
+FIELD_MAP = {
+    # Top-level transformer fields
+    "vocab_size": "vocabulary_size",
+    "hidden_size": "hidden_size",
+    # Position embedding
+    "max_position_embeddings": "position_embedding.max_position_embeddings",
+    "rope_theta": "position_embedding.rope_theta",
+    "rope_scaling": "position_embedding.rope_scaling",
+    # Attention
+    "num_attention_heads": "attention.num_attention_heads",
+    "num_key_value_heads": "attention.num_key_value_heads",
+    "head_dim": "attention.head_dim",
+    # FFN / MLP
+    "intermediate_size": "mlp.intermediate_size",
+    # Transformer layers
+    "num_hidden_layers": "num_layers",
+    # Normalization
+    "rms_norm_eps": "norm.epsilon",
+    # MoE fields
+    "num_local_experts": "moe.num_experts",
+    "num_experts_per_tok": "moe.top_k",
+    "num_experts": "moe.num_experts",
+    "n_routed_experts": "moe.num_experts",  # DeepSeek naming variant
+    # MLA fields (DeepSeek)
+    "kv_lora_rank": "attention.kv_lora_rank",
+    "q_lora_rank": "attention.q_lora_rank",
+    "qk_nope_head_dim": "attention.qk_nope_head_dim",
+    "qk_rope_head_dim": "attention.qk_rope_head_dim",
+    "v_head_dim": "attention.v_head_dim",
+}
+
+# Known model type → attention type mapping
+ATTENTION_TYPE_MAP = {
+    "llama": "gqa",
+    "mistral": "gqa",
+    "mixtral": "gqa",
+    "qwen2": "gqa",
+    "qwen2_moe": "gqa",
+    "gemma": "gqa",
+    "gemma2": "gqa",
+    "phi3": "gqa",
+    "deepseek_v2": "mla",
+    "deepseek_v3": "mla",
+    "gpt2": "mha",
+    "gpt_neo": "mha",
+    "gpt_neox": "mha",
+    "falcon": "mha",
+}
+
+# Known model type → FFN type mapping
+FFN_TYPE_MAP = {
+    "llama": "mlp",
+    "mistral": "mlp",
+    "mixtral": "moe",
+    "qwen2": "mlp",
+    "qwen2_moe": "moe",
+    "gemma": "mlp",
+    "gemma2": "mlp",
+    "phi3": "mlp",
+    "deepseek_v2": "moe",
+    "deepseek_v3": "moe",
+    "gpt2": "mlp",
+    "gpt_neo": "mlp",
+    "gpt_neox": "mlp",
+    "falcon": "mlp",
+}
+
+# Known model type → activation function mapping
+ACTIVATION_MAP = {
+    "llama": "silu",
+    "mistral": "silu",
+    "mixtral": "silu",
+    "qwen2": "silu",
+    "qwen2_moe": "silu",
+    "gemma": "gelu",
+    "gemma2": "gelu",
+    "phi3": "silu",
+    "gpt2": "gelu",
+    "gpt_neo": "gelu",
+    "gpt_neox": "gelu",
+    "falcon": "gelu",
+}
+
+
+def _set_nested(d: dict, path: str, value) -> None:
+    """Set a value in a nested dict using a dot-separated path."""
+    keys = path.split(".")
+    for key in keys[:-1]:
+        d = d.setdefault(key, {})
+    d[keys[-1]] = value
+
+
+def _get_nested(d: dict, path: str, default=None):
+    """Get a value from a nested dict using a dot-separated path."""
+    keys = path.split(".")
+    for key in keys:
+        if not isinstance(d, dict) or key not in d:
+            return default
+        d = d[key]
+    return d
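+
+
+# Quick sketch of the helper semantics (illustrative; mirrors what the unit
+# tests in hf_parser_test.py assume): dot paths expand into nested dicts on
+# write and collapse back on read.
+#
+#     d: dict = {}
+#     _set_nested(d, "attention.head_dim", 128)
+#     assert d == {"attention": {"head_dim": 128}}
+#     assert _get_nested(d, "attention.head_dim") == 128
+#     assert _get_nested(d, "attention.missing", default=64) == 64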
path.""" + keys = path.split(".") + for key in keys: + if not isinstance(d, dict) or key not in d: + return default + d = d[key] + return d + + +def parse_hf_config(raw: dict) -> dict: + """Parse a HuggingFace config.json dict into ModelPack transformer spec. + + Args: + raw: The parsed config.json dict from HuggingFace. + + Returns: + A dict following the ModelPack transformer spec format. + """ + result: dict = {} + model_type = raw.get("model_type", "").lower() + + # Map static fields + for hf_key, mp_path in FIELD_MAP.items(): + if hf_key in raw and raw[hf_key] is not None: + _set_nested(result, mp_path, raw[hf_key]) + + # Derive head_dim if absent + if "attention" in result and "head_dim" not in result.get("attention", {}): + hidden = result.get("hidden_size") + n_heads = _get_nested(result, "attention.num_attention_heads") + if hidden and n_heads: + _set_nested(result, "attention.head_dim", hidden // n_heads) + + # Set architecture type + result["type"] = "decoder" + result["architecture_version"] = "0.1.0" + + # Infer attention type from model_type + attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW) + _set_nested(result, "attention.type", attn_type) + _set_nested(result, "attention.is_causal", True) + + # Check for sliding window attention + if raw.get("sliding_window") is not None: + _set_nested(result, "attention.sliding_window", raw["sliding_window"]) + + # Infer FFN type + ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW) + result["ffn_type"] = ffn_type + + # Set activation function + hf_activation = raw.get("hidden_act", raw.get("activation_function")) + if hf_activation: + activation = hf_activation.lower() + if "silu" in activation or "swish" in activation: + activation = "silu" + elif "gelu" in activation: + activation = "gelu" + elif "relu" in activation: + activation = "relu" + else: + activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW) + + if ffn_type == "mlp": + _set_nested(result, "mlp.activation", activation) + # Most modern models use gated activation (SwiGLU, GeGLU) + use_gated = model_type in ( + "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3", + "gemma", "gemma2", "deepseek_v2", "deepseek_v3", + ) + _set_nested(result, "mlp.use_gated_activation", use_gated) + elif ffn_type == "moe": + _set_nested(result, "moe.activation", activation) + # MoE-specific fields + if "moe_intermediate_size" in raw: + _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"]) + if "num_shared_experts" in raw: + _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"]) + if "shared_expert_intermediate_size" in raw: + _set_nested( + result, "moe.shared_expert_intermediate_size", + raw["shared_expert_intermediate_size"], + ) + # DeepSeek MoE-specific fields (from PR #185 research) + if "routed_scaling_factor" in raw: + _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"]) + if "topk_method" in raw: + _set_nested(result, "moe.topk_method", raw["topk_method"]) + if "norm_topk_prob" in raw: + _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"]) + + # Mixed layers support (DeepSeek uses dense layers before switching to MoE) + if "first_k_dense_replace" in raw and "moe_layer_freq" in raw: + result["layer_structure"] = "mixed" + _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"]) + _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"]) + + # Normalization + norm_type = "rmsnorm" # Most modern models use RMSNorm + if model_type in ("gpt2", 
"gpt_neo"): + norm_type = "layernorm" + _set_nested(result, "norm.type", norm_type) + + if "layer_norm_eps" in raw: + _set_nested(result, "norm.epsilon", raw["layer_norm_eps"]) + + # Tokenizer + _set_nested(result, "tokenizer.type", "bpe") + _set_nested(result, "tokenizer.library", "huggingface") + + # Position embedding type + if model_type in ("gpt2", "gpt_neo"): + _set_nested(result, "position_embedding.type", "learned") + else: + _set_nested(result, "position_embedding.type", "rope") + + # Embedding + tie_embeddings = raw.get("tie_word_embeddings", False) + _set_nested(result, "token_embedding.shared_embedding", tie_embeddings) + + # Bias flags + attn_bias = raw.get("attention_bias", False) + _set_nested(result, "attention.has_qkv_bias", attn_bias) + _set_nested(result, "attention.has_output_bias", attn_bias) + + mlp_bias = raw.get("mlp_bias", False) + if ffn_type == "mlp": + _set_nested(result, "mlp.has_bias", mlp_bias) + + return result + + +def format_yaml(spec: dict, indent: int = 0) -> str: + """Format a spec dict as YAML string.""" + lines = [] + prefix = " " * indent + for key, value in spec.items(): + if isinstance(value, dict): + lines.append(f"{prefix}{key}:") + lines.append(format_yaml(value, indent + 1)) + elif isinstance(value, bool): + lines.append(f"{prefix}{key}: {str(value).lower()}") + elif isinstance(value, str): + if value == NEEDS_REVIEW: + lines.append(f"{prefix}{key}: {value} # requires human review") + else: + lines.append(f'{prefix}{key}: "{value}"') + elif value is None: + lines.append(f"{prefix}{key}: null") + else: + lines.append(f"{prefix}{key}: {value}") + return "\n".join(lines) + + +def load_config(source: str) -> dict: + """Load a config.json from a file path or HuggingFace model ID. + + Args: + source: Either a local file path or a HuggingFace model ID. + + Returns: + The parsed config.json dict. + """ + path = Path(source) + if path.is_file(): + with path.open(encoding="utf-8") as f: + return json.load(f) + + # Try loading from HuggingFace Hub + try: + from huggingface_hub import hf_hub_download + + config_path = hf_hub_download(repo_id=source, filename="config.json") + with open(config_path, encoding="utf-8") as f: + return json.load(f) + except ImportError: + print( + "error: huggingface_hub not installed. 
" + "Install with: pip install huggingface_hub", + file=sys.stderr, + ) + sys.exit(1) + except Exception as e: + print(f"error: failed to load config from '{source}': {e}", file=sys.stderr) + sys.exit(1) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Parse HuggingFace config.json into ModelPack transformer spec", + ) + parser.add_argument( + "model", + help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) " + "or path to config.json", + ) + parser.add_argument( + "--format", + choices=["yaml", "json"], + default="yaml", + help="Output format (default: yaml)", + ) + + args = parser.parse_args() + + raw = load_config(args.model) + spec = parse_hf_config(raw) + + model_type = raw.get("model_type", "unknown") + model_name = raw.get("_name_or_path", args.model) + + if args.format == "json": + print(json.dumps(spec, indent=2)) + else: + print(f"# ModelPack Transformer Spec") + print(f"# Generated from: {model_name}") + print(f"# Model type: {model_type}") + print(f"# NOTE: Fields marked NEEDS_REVIEW require human verification") + print() + print(format_yaml(spec)) + + # Report coverage + needs_review = [] + _find_needs_review(spec, "", needs_review) + if needs_review: + print(f"\n# --- Fields requiring review ({len(needs_review)}) ---") + for field in needs_review: + print(f"# - {field}") + + return 0 + + +def _find_needs_review(d: dict, prefix: str, result: list) -> None: + """Recursively find all NEEDS_REVIEW fields.""" + for key, value in d.items(): + path = f"{prefix}.{key}" if prefix else key + if isinstance(value, dict): + _find_needs_review(value, path, result) + elif value == NEEDS_REVIEW: + result.append(path) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/hf_parser_test.py b/tools/hf_parser_test.py new file mode 100644 index 0000000..75227fc --- /dev/null +++ b/tools/hf_parser_test.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +"""Tests for the HuggingFace config parser.""" + +from __future__ import annotations + +import pytest + +from hf_parser import NEEDS_REVIEW, parse_hf_config + + +# Minimal config.json samples based on real HuggingFace models. 
+MISTRAL_7B_CONFIG = {
+    "model_type": "mistral",
+    "vocab_size": 32000,
+    "hidden_size": 4096,
+    "num_hidden_layers": 32,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 8,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 32768,
+    "rope_theta": 10000.0,
+    "rms_norm_eps": 1e-5,
+    "hidden_act": "silu",
+    "sliding_window": 4096,
+    "tie_word_embeddings": False,
+    "attention_bias": False,
+}
+
+MIXTRAL_8X7B_CONFIG = {
+    "model_type": "mixtral",
+    "vocab_size": 32000,
+    "hidden_size": 4096,
+    "num_hidden_layers": 32,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 8,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 32768,
+    "rope_theta": 1000000.0,
+    "rms_norm_eps": 1e-5,
+    "hidden_act": "silu",
+    "num_local_experts": 8,
+    "num_experts_per_tok": 2,
+    "tie_word_embeddings": False,
+}
+
+QWEN2_7B_CONFIG = {
+    "model_type": "qwen2",
+    "vocab_size": 152064,
+    "hidden_size": 3584,
+    "num_hidden_layers": 28,
+    "num_attention_heads": 28,
+    "num_key_value_heads": 4,
+    "intermediate_size": 18944,
+    "max_position_embeddings": 131072,
+    "rope_theta": 1000000.0,
+    "rms_norm_eps": 1e-6,
+    "hidden_act": "silu",
+    "tie_word_embeddings": False,
+    "attention_bias": True,
+    "sliding_window": 131072,
+}
+
+GPT2_CONFIG = {
+    "model_type": "gpt2",
+    "vocab_size": 50257,
+    "hidden_size": 768,
+    "num_hidden_layers": 12,
+    "num_attention_heads": 12,
+    "intermediate_size": 3072,
+    "max_position_embeddings": 1024,
+    "layer_norm_eps": 1e-5,
+    "activation_function": "gelu_new",
+    "tie_word_embeddings": True,
+}
+
+DEEPSEEK_V2_LITE_CONFIG = {
+    "model_type": "deepseek_v2",
+    "vocab_size": 102400,
+    "hidden_size": 2048,
+    "num_hidden_layers": 27,
+    "num_attention_heads": 16,
+    "num_key_value_heads": 16,
+    "intermediate_size": 10944,
+    "max_position_embeddings": 163840,
+    "rope_theta": 10000.0,
+    "rms_norm_eps": 1e-6,
+    "hidden_act": "silu",
+    "kv_lora_rank": 512,
+    "q_lora_rank": 1536,
+    "qk_nope_head_dim": 128,
+    "qk_rope_head_dim": 64,
+    "v_head_dim": 128,
+    "n_routed_experts": 64,
+    "num_experts_per_tok": 6,
+    "first_k_dense_replace": 1,
+    "moe_layer_freq": 1,
+    "num_shared_experts": 2,
+    "routed_scaling_factor": 1.0,
+    "topk_method": "group_limited_greedy",
+    "norm_topk_prob": False,
+    "tie_word_embeddings": False,
+}
+
+UNKNOWN_CONFIG = {
+    "model_type": "some_new_model",
+    "vocab_size": 65536,
+    "hidden_size": 2048,
+    "num_hidden_layers": 24,
+    "num_attention_heads": 16,
+}
+
+
+class TestMistral:
+    def test_basic_fields(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["vocabulary_size"] == 32000
+        assert spec["hidden_size"] == 4096
+        assert spec["num_layers"] == 32
+        assert spec["type"] == "decoder"
+
+    def test_attention(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        attn = spec["attention"]
+        assert attn["type"] == "gqa"
+        assert attn["num_attention_heads"] == 32
+        assert attn["num_key_value_heads"] == 8
+        assert attn["head_dim"] == 128  # 4096 / 32
+        assert attn["is_causal"] is True
+        assert attn["sliding_window"] == 4096
+
+    def test_ffn(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["ffn_type"] == "mlp"
+        assert spec["mlp"]["intermediate_size"] == 14336
+        assert spec["mlp"]["activation"] == "silu"
+        assert spec["mlp"]["use_gated_activation"] is True
+
+    def test_norm(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["norm"]["type"] == "rmsnorm"
+        assert spec["norm"]["epsilon"] == 1e-5
+
+    def test_position_embedding(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        pe = spec["position_embedding"]
+        assert pe["type"] == "rope"
+        assert pe["rope_theta"] == 10000.0
+        assert pe["max_position_embeddings"] == 32768
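+
+
+# A small additional check (a sketch beyond the original suite): the parser
+# unconditionally stamps tokenizer defaults, which downstream consumers may
+# rely on.
+class TestTokenizerDefaults:
+    def test_tokenizer_defaults(self):
+        spec = parse_hf_config(MISTRAL_7B_CONFIG)
+        assert spec["tokenizer"]["type"] == "bpe"
+        assert spec["tokenizer"]["library"] == "huggingface"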
"rope" + assert pe["rope_theta"] == 10000.0 + assert pe["max_position_embeddings"] == 32768 + + +class TestMixtral: + def test_moe_detection(self): + spec = parse_hf_config(MIXTRAL_8X7B_CONFIG) + assert spec["ffn_type"] == "moe" + assert spec["moe"]["num_experts"] == 8 + assert spec["moe"]["top_k"] == 2 + + def test_attention_still_gqa(self): + spec = parse_hf_config(MIXTRAL_8X7B_CONFIG) + assert spec["attention"]["type"] == "gqa" + assert spec["attention"]["num_key_value_heads"] == 8 + + +class TestQwen2: + def test_attention_bias(self): + spec = parse_hf_config(QWEN2_7B_CONFIG) + assert spec["attention"]["has_qkv_bias"] is True + assert spec["attention"]["has_output_bias"] is True + + def test_rope_theta(self): + spec = parse_hf_config(QWEN2_7B_CONFIG) + assert spec["position_embedding"]["rope_theta"] == 1000000.0 + + +class TestGPT2: + def test_mha_attention(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["attention"]["type"] == "mha" + + def test_layernorm(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["norm"]["type"] == "layernorm" + assert spec["norm"]["epsilon"] == 1e-5 + + def test_tied_embeddings(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["token_embedding"]["shared_embedding"] is True + + def test_no_gated_activation(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["mlp"]["use_gated_activation"] is False + + def test_gelu_activation(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["mlp"]["activation"] == "gelu" + + +class TestDeepSeekV2: + def test_mla_attention(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["attention"]["type"] == "mla" + + def test_mla_fields(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + attn = spec["attention"] + assert attn["kv_lora_rank"] == 512 + assert attn["q_lora_rank"] == 1536 + assert attn["qk_nope_head_dim"] == 128 + assert attn["qk_rope_head_dim"] == 64 + assert attn["v_head_dim"] == 128 + + def test_moe_with_n_routed_experts(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["ffn_type"] == "moe" + assert spec["moe"]["num_experts"] == 64 + assert spec["moe"]["top_k"] == 6 + + def test_shared_experts(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["moe"]["num_shared_experts"] == 2 + + def test_moe_routing_fields(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["moe"]["routed_scaling_factor"] == 1.0 + assert spec["moe"]["topk_method"] == "group_limited_greedy" + assert spec["moe"]["norm_topk_prob"] is False + + def test_mixed_layers(self): + spec = parse_hf_config(DEEPSEEK_V2_LITE_CONFIG) + assert spec["layer_structure"] == "mixed" + assert spec["mixed_layers"]["first_k_dense_replace"] == 1 + assert spec["mixed_layers"]["moe_layer_freq"] == 1 + + +class TestGPT2PositionEmbedding: + def test_learned_position_embedding(self): + spec = parse_hf_config(GPT2_CONFIG) + assert spec["position_embedding"]["type"] == "learned" + + +class TestUnknownModel: + def test_needs_review_flags(self): + spec = parse_hf_config(UNKNOWN_CONFIG) + assert spec["attention"]["type"] == NEEDS_REVIEW + assert spec["ffn_type"] == NEEDS_REVIEW + + def test_static_fields_still_parsed(self): + spec = parse_hf_config(UNKNOWN_CONFIG) + assert spec["vocabulary_size"] == 65536 + assert spec["hidden_size"] == 2048 + assert spec["num_layers"] == 24 + + def test_head_dim_derived(self): + spec = parse_hf_config(UNKNOWN_CONFIG) + assert spec["attention"]["head_dim"] == 128 # 2048 / 16 + + +class TestHeadDimDerivation: + def 
+
+
+class TestHeadDimDerivation:
+    def test_explicit_head_dim(self):
+        config = {
+            "model_type": "mistral",
+            "vocab_size": 32000,
+            "hidden_size": 4096,
+            "num_attention_heads": 32,
+            "head_dim": 64,  # explicit, not derived
+        }
+        spec = parse_hf_config(config)
+        assert spec["attention"]["head_dim"] == 64  # uses explicit value
+
+    def test_derived_head_dim(self):
+        config = {
+            "model_type": "llama",
+            "vocab_size": 32000,
+            "hidden_size": 4096,
+            "num_attention_heads": 32,
+        }
+        spec = parse_hf_config(config)
+        assert spec["attention"]["head_dim"] == 128  # 4096 / 32
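+
+
+# Sketch (beyond the original suite): spot-check format_yaml's rendering rules
+# (lowercase booleans, quoted strings, NEEDS_REVIEW annotation). Imported
+# locally to keep the module's top-level imports unchanged.
+class TestFormatYaml:
+    def test_needs_review_annotation(self):
+        from hf_parser import format_yaml
+
+        out = format_yaml({"ffn_type": NEEDS_REVIEW})
+        assert "requires human review" in out
+
+    def test_strings_quoted_and_bools_lowercase(self):
+        from hf_parser import format_yaml
+
+        out = format_yaml({"norm": {"type": "rmsnorm"}, "tied": True})
+        assert 'type: "rmsnorm"' in out
+        assert "tied: true" in out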