|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Parse HuggingFace model config.json into ModelPack transformer spec format. |
| 3 | +
|
| 4 | +This tool maps HuggingFace Transformers config.json fields to the ModelPack |
| 5 | +unified transformer specification vocabulary defined in PR #111 |
| 6 | +(docs/architecture.md by @aftersnow). |
| 7 | +
|
| 8 | +Usage: |
| 9 | + python tools/hf_parser.py meta-llama/Meta-Llama-3-8B |
| 10 | + python tools/hf_parser.py mistralai/Mistral-7B-v0.3 |
| 11 | + python tools/hf_parser.py --file path/to/config.json |
| 12 | +
|
| 13 | +The output is a YAML spec file following the ModelPack transformer spec format. |
| 14 | +Fields that cannot be reliably inferred from config.json are marked as |
| 15 | +NEEDS_REVIEW for human verification. |
| 16 | +""" |
| 17 | + |
| 18 | +from __future__ import annotations |
| 19 | + |
| 20 | +import argparse |
| 21 | +import json |
| 22 | +import sys |
| 23 | +from pathlib import Path |
| 24 | + |
# Sentinel string for spec fields that cannot be inferred automatically from
# config.json and must be filled in by a human reviewer.
NEEDS_REVIEW = "__NEEDS_REVIEW__"

# Maps HuggingFace config.json field names to ModelPack transformer spec paths.
# Based on PR #111's field vocabulary (docs/architecture.md).
# Values are dot-separated paths consumed by _set_nested, so "attention.head_dim"
# nests "head_dim" under the "attention" section. Several HF aliases
# ("num_local_experts", "num_experts", "n_routed_experts") map to the same
# "moe.num_experts" path; when more than one alias is present in config.json,
# the alias iterated last in this dict wins.
FIELD_MAP = {
    # Top-level transformer fields
    "vocab_size": "vocabulary_size",
    "hidden_size": "hidden_size",
    # Position embedding
    "max_position_embeddings": "position_embedding.max_position_embeddings",
    "rope_theta": "position_embedding.rope_theta",
    "rope_scaling": "position_embedding.rope_scaling",
    # Attention
    "num_attention_heads": "attention.num_attention_heads",
    "num_key_value_heads": "attention.num_key_value_heads",
    "head_dim": "attention.head_dim",
    # FFN / MLP
    "intermediate_size": "mlp.intermediate_size",
    # Transformer layers
    "num_hidden_layers": "num_layers",
    # Normalization
    "rms_norm_eps": "norm.epsilon",
    # MoE fields
    "num_local_experts": "moe.num_experts",
    "num_experts_per_tok": "moe.top_k",
    "num_experts": "moe.num_experts",
    "n_routed_experts": "moe.num_experts",  # DeepSeek naming variant
    # MLA fields (DeepSeek)
    "kv_lora_rank": "attention.kv_lora_rank",
    "q_lora_rank": "attention.q_lora_rank",
    "qk_nope_head_dim": "attention.qk_nope_head_dim",
    "qk_rope_head_dim": "attention.qk_rope_head_dim",
    "v_head_dim": "attention.v_head_dim",
}
| 59 | + |
# Known model type → attention type mapping.
# Model types missing from this table fall back to NEEDS_REVIEW in
# parse_hf_config rather than guessing.
ATTENTION_TYPE_MAP = {
    "llama": "gqa",
    "mistral": "gqa",
    "mixtral": "gqa",
    "qwen2": "gqa",
    "qwen2_moe": "gqa",
    "gemma": "gqa",
    "gemma2": "gqa",
    "phi3": "gqa",
    "deepseek_v2": "mla",
    "deepseek_v3": "mla",
    "gpt2": "mha",
    "gpt_neo": "mha",
    "gpt_neox": "mha",
    "falcon": "mha",
}
| 77 | + |
# Known model type → FFN type mapping ("mlp" = dense feed-forward,
# "moe" = mixture-of-experts). Unknown model types fall back to NEEDS_REVIEW.
FFN_TYPE_MAP = {
    "llama": "mlp",
    "mistral": "mlp",
    "mixtral": "moe",
    "qwen2": "mlp",
    "qwen2_moe": "moe",
    "gemma": "mlp",
    "gemma2": "mlp",
    "phi3": "mlp",
    "deepseek_v2": "moe",
    "deepseek_v3": "moe",
    "gpt2": "mlp",
    "gpt_neo": "mlp",
    "gpt_neox": "mlp",
    "falcon": "mlp",
}
| 95 | + |
# Known model type → activation function mapping, used as the fallback when
# the config's "hidden_act"/"activation_function" string is absent or not one
# of the recognized silu/gelu/relu variants.
# NOTE(review): deepseek_v2/deepseek_v3 are absent here — confirm intended.
ACTIVATION_MAP = {
    "llama": "silu",
    "mistral": "silu",
    "mixtral": "silu",
    "qwen2": "silu",
    "qwen2_moe": "silu",
    "gemma": "gelu",
    "gemma2": "gelu",
    "phi3": "silu",
    "gpt2": "gelu",
    "gpt_neo": "gelu",
    "gpt_neox": "gelu",
    "falcon": "gelu",
}
| 111 | + |
| 112 | + |
| 113 | +def _set_nested(d: dict, path: str, value) -> None: |
| 114 | + """Set a value in a nested dict using dot-separated path.""" |
| 115 | + keys = path.split(".") |
| 116 | + for key in keys[:-1]: |
| 117 | + d = d.setdefault(key, {}) |
| 118 | + d[keys[-1]] = value |
| 119 | + |
| 120 | + |
| 121 | +def _get_nested(d: dict, path: str, default=None): |
| 122 | + """Get a value from a nested dict using dot-separated path.""" |
| 123 | + keys = path.split(".") |
| 124 | + for key in keys: |
| 125 | + if not isinstance(d, dict) or key not in d: |
| 126 | + return default |
| 127 | + d = d[key] |
| 128 | + return d |
| 129 | + |
| 130 | + |
def parse_hf_config(raw: dict) -> dict:
    """Parse a HuggingFace config.json dict into ModelPack transformer spec.

    Args:
        raw: The parsed config.json dict from HuggingFace.

    Returns:
        A dict following the ModelPack transformer spec format. Fields that
        cannot be reliably inferred are set to the NEEDS_REVIEW sentinel.
    """
    result: dict = {}
    model_type = raw.get("model_type", "").lower()

    # Map static fields (dot paths in FIELD_MAP create nested sections).
    for hf_key, mp_path in FIELD_MAP.items():
        if hf_key in raw and raw[hf_key] is not None:
            _set_nested(result, mp_path, raw[hf_key])

    # Derive head_dim if absent (standard hidden_size // num_heads split).
    if "attention" in result and "head_dim" not in result.get("attention", {}):
        hidden = result.get("hidden_size")
        n_heads = _get_nested(result, "attention.num_attention_heads")
        if hidden and n_heads:
            _set_nested(result, "attention.head_dim", hidden // n_heads)

    # Set architecture type. NOTE(review): always emitted as decoder-only;
    # encoder/encoder-decoder configs are not handled here.
    result["type"] = "decoder"
    result["architecture_version"] = "0.1.0"

    # Infer attention type from model_type (NEEDS_REVIEW when unknown).
    attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW)
    _set_nested(result, "attention.type", attn_type)
    _set_nested(result, "attention.is_causal", True)

    # Check for sliding window attention (present in e.g. Mistral configs).
    if raw.get("sliding_window") is not None:
        _set_nested(result, "attention.sliding_window", raw["sliding_window"])

    # Infer FFN type (dense MLP vs mixture-of-experts).
    ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW)
    result["ffn_type"] = ffn_type

    # Determine the activation function.
    # BUGFIX: `activation` was previously assigned only inside the
    # `if hf_activation:` branch, so a config with neither "hidden_act" nor
    # "activation_function" raised NameError when it was used below. Default
    # to the per-model-type table first, then refine from the config string.
    activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW)
    hf_activation = raw.get("hidden_act", raw.get("activation_function"))
    if hf_activation:
        lowered = hf_activation.lower()
        # Normalize HF variants (e.g. "gelu_new", "swish") to canonical names.
        if "silu" in lowered or "swish" in lowered:
            activation = "silu"
        elif "gelu" in lowered:
            activation = "gelu"
        elif "relu" in lowered:
            activation = "relu"
        # Otherwise keep the model-type default chosen above.

    if ffn_type == "mlp":
        _set_nested(result, "mlp.activation", activation)
        # Most modern models use gated activation (SwiGLU, GeGLU)
        use_gated = model_type in (
            "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3",
            "gemma", "gemma2", "deepseek_v2", "deepseek_v3",
        )
        _set_nested(result, "mlp.use_gated_activation", use_gated)
    elif ffn_type == "moe":
        _set_nested(result, "moe.activation", activation)
        # MoE-specific fields
        if "moe_intermediate_size" in raw:
            _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"])
        if "num_shared_experts" in raw:
            _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"])
        if "shared_expert_intermediate_size" in raw:
            _set_nested(
                result, "moe.shared_expert_intermediate_size",
                raw["shared_expert_intermediate_size"],
            )
        # DeepSeek MoE-specific fields (from PR #185 research)
        if "routed_scaling_factor" in raw:
            _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"])
        if "topk_method" in raw:
            _set_nested(result, "moe.topk_method", raw["topk_method"])
        if "norm_topk_prob" in raw:
            _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"])

    # Mixed layers support (DeepSeek uses dense layers before switching to MoE)
    if "first_k_dense_replace" in raw and "moe_layer_freq" in raw:
        result["layer_structure"] = "mixed"
        _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"])
        _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"])

    # Normalization: default RMSNorm; GPT-2-era models use LayerNorm.
    norm_type = "rmsnorm"  # Most modern models use RMSNorm
    if model_type in ("gpt2", "gpt_neo"):
        norm_type = "layernorm"
    _set_nested(result, "norm.type", norm_type)

    # layer_norm_eps overwrites rms_norm_eps (FIELD_MAP) when both exist.
    if "layer_norm_eps" in raw:
        _set_nested(result, "norm.epsilon", raw["layer_norm_eps"])

    # Tokenizer. NOTE(review): assumes a HuggingFace BPE tokenizer for every
    # model — confirm per model family.
    _set_nested(result, "tokenizer.type", "bpe")
    _set_nested(result, "tokenizer.library", "huggingface")

    # Position embedding type: learned absolute for GPT-2 family, RoPE otherwise.
    if model_type in ("gpt2", "gpt_neo"):
        _set_nested(result, "position_embedding.type", "learned")
    else:
        _set_nested(result, "position_embedding.type", "rope")

    # Embedding tying (input embedding shared with the output projection).
    tie_embeddings = raw.get("tie_word_embeddings", False)
    _set_nested(result, "token_embedding.shared_embedding", tie_embeddings)

    # Bias flags: HF's single attention_bias flag covers both QKV and output.
    attn_bias = raw.get("attention_bias", False)
    _set_nested(result, "attention.has_qkv_bias", attn_bias)
    _set_nested(result, "attention.has_output_bias", attn_bias)

    mlp_bias = raw.get("mlp_bias", False)
    if ffn_type == "mlp":
        _set_nested(result, "mlp.has_bias", mlp_bias)

    return result
| 252 | + |
| 253 | + |
def format_yaml(spec: dict, indent: int = 0) -> str:
    """Render *spec* as simple indented YAML text (two spaces per level).

    NEEDS_REVIEW sentinels are emitted bare with a trailing review comment;
    other strings are double-quoted, booleans lowercased, None as null.
    """
    pad = "  " * indent
    out = []
    for key, val in spec.items():
        if isinstance(val, dict):
            out.append(f"{pad}{key}:")
            out.append(format_yaml(val, indent + 1))
        elif isinstance(val, bool):
            # Checked before the generic case: bool is a subclass of int.
            out.append(f"{pad}{key}: " + ("true" if val else "false"))
        elif isinstance(val, str):
            rendered = (
                f"{val} # requires human review" if val == NEEDS_REVIEW else f'"{val}"'
            )
            out.append(f"{pad}{key}: {rendered}")
        elif val is None:
            out.append(f"{pad}{key}: null")
        else:
            out.append(f"{pad}{key}: {val}")
    return "\n".join(out)
| 274 | + |
| 275 | + |
def load_config(source: str) -> dict:
    """Load a config.json from a file path or HuggingFace model ID.

    Args:
        source: Either a local file path or a HuggingFace model ID.

    Returns:
        The parsed config.json dict. Exits the process (status 1) when the
        Hub download fails or huggingface_hub is not installed.
    """
    local = Path(source)
    if local.is_file():
        with local.open(encoding="utf-8") as handle:
            return json.load(handle)

    # Not a local file: fall back to downloading from the HuggingFace Hub.
    try:
        # Imported lazily so local-file usage works without the dependency.
        from huggingface_hub import hf_hub_download

        downloaded = hf_hub_download(repo_id=source, filename="config.json")
        with open(downloaded, encoding="utf-8") as handle:
            return json.load(handle)
    except ImportError:
        print(
            "error: huggingface_hub not installed. "
            "Install with: pip install huggingface_hub",
            file=sys.stderr,
        )
        sys.exit(1)
    except Exception as e:
        print(f"error: failed to load config from '{source}': {e}", file=sys.stderr)
        sys.exit(1)
| 307 | + |
| 308 | + |
def main() -> int:
    """CLI entry point: load a config, convert it, and print the spec.

    Returns:
        Process exit code (0 on success; load_config exits with status 1
        itself on download/parse failure).
    """
    parser = argparse.ArgumentParser(
        description="Parse HuggingFace config.json into ModelPack transformer spec",
    )
    parser.add_argument(
        "model",
        help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) "
        "or path to config.json",
    )
    parser.add_argument(
        "--format",
        choices=["yaml", "json"],
        default="yaml",
        help="Output format (default: yaml)",
    )

    args = parser.parse_args()

    raw = load_config(args.model)
    spec = parse_hf_config(raw)

    model_type = raw.get("model_type", "unknown")
    model_name = raw.get("_name_or_path", args.model)

    if args.format == "json":
        print(json.dumps(spec, indent=2))
    else:
        # Placeholder-free lines are plain strings (were no-op f-strings).
        print("# ModelPack Transformer Spec")
        print(f"# Generated from: {model_name}")
        print(f"# Model type: {model_type}")
        print("# NOTE: Fields marked NEEDS_REVIEW require human verification")
        print()
        print(format_yaml(spec))

    # Report coverage for human reviewers.
    # NOTE(review): this also runs for --format=json, appending "#" lines that
    # make the combined stdout invalid JSON — consider routing to stderr.
    needs_review = []
    _find_needs_review(spec, "", needs_review)
    if needs_review:
        print(f"\n# --- Fields requiring review ({len(needs_review)}) ---")
        for field in needs_review:
            print(f"# - {field}")

    return 0
| 352 | + |
| 353 | + |
| 354 | +def _find_needs_review(d: dict, prefix: str, result: list) -> None: |
| 355 | + """Recursively find all NEEDS_REVIEW fields.""" |
| 356 | + for key, value in d.items(): |
| 357 | + path = f"{prefix}.{key}" if prefix else key |
| 358 | + if isinstance(value, dict): |
| 359 | + _find_needs_review(value, path, result) |
| 360 | + elif value == NEEDS_REVIEW: |
| 361 | + result.append(path) |
| 362 | + |
| 363 | + |
# Script entry point: exit status comes from main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments