Skip to content

Commit d64692a

Browse files
committed
feat(tools): add HuggingFace config parser for transformer spec (issue modelpack#164)
Adds hf_parser.py that converts HuggingFace config.json into ModelPack transformer spec format (PR modelpack#111 vocabulary). Supports Mistral, Mixtral, Qwen2, GPT-2, DeepSeek-V2 (MLA + mixed layers), and unknown models with NEEDS_REVIEW fallback. Includes 26 unit tests. Improvements over PR modelpack#185's field mapping research: - MLA attention fields (kv_lora_rank, q_lora_rank, qk_nope/rope_head_dim) - DeepSeek MoE routing params (routed_scaling_factor, topk_method) - Mixed layers support (first_k_dense_replace, moe_layer_freq) - Correct learned position embedding for GPT-2/GPT-Neo Signed-off-by: pradhyum6144 <pradhyum314@gmail.com>
1 parent b24b8fd commit d64692a

File tree

2 files changed

+643
-0
lines changed

2 files changed

+643
-0
lines changed

tools/hf_parser.py

Lines changed: 365 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,365 @@
1+
#!/usr/bin/env python3
2+
"""Parse HuggingFace model config.json into ModelPack transformer spec format.
3+
4+
This tool maps HuggingFace Transformers config.json fields to the ModelPack
5+
unified transformer specification vocabulary defined in PR #111
6+
(docs/architecture.md by @aftersnow).
7+
8+
Usage:
9+
python tools/hf_parser.py meta-llama/Meta-Llama-3-8B
10+
python tools/hf_parser.py mistralai/Mistral-7B-v0.3
11+
python tools/hf_parser.py --file path/to/config.json
12+
13+
The output is a YAML spec file following the ModelPack transformer spec format.
14+
Fields that cannot be reliably inferred from config.json are marked as
15+
NEEDS_REVIEW for human verification.
16+
"""
17+
18+
from __future__ import annotations
19+
20+
import argparse
21+
import json
22+
import sys
23+
from pathlib import Path
24+
25+
# Sentinel value written into the generated spec wherever a field cannot be
# reliably inferred from config.json; the YAML formatter flags these entries
# with a "requires human review" comment.
NEEDS_REVIEW = "__NEEDS_REVIEW__"
26+
27+
# Maps HuggingFace config.json field names to ModelPack transformer spec paths.
# Based on PR #111's field vocabulary (docs/architecture.md).
#
# Values are dot-separated paths consumed by _set_nested. Note that several
# HF keys ("num_local_experts", "num_experts", "n_routed_experts") map to the
# same spec path ("moe.num_experts"); insertion order here decides which one
# wins in the unusual case that a config carries more than one of them, so
# do not reorder entries casually.
FIELD_MAP = {
    # Top-level transformer fields
    "vocab_size": "vocabulary_size",
    "hidden_size": "hidden_size",
    # Position embedding
    "max_position_embeddings": "position_embedding.max_position_embeddings",
    "rope_theta": "position_embedding.rope_theta",
    "rope_scaling": "position_embedding.rope_scaling",
    # Attention
    "num_attention_heads": "attention.num_attention_heads",
    "num_key_value_heads": "attention.num_key_value_heads",
    "head_dim": "attention.head_dim",
    # FFN / MLP
    "intermediate_size": "mlp.intermediate_size",
    # Transformer layers
    "num_hidden_layers": "num_layers",
    # Normalization
    "rms_norm_eps": "norm.epsilon",
    # MoE fields (Mixtral uses num_local_experts, Qwen2-MoE uses num_experts)
    "num_local_experts": "moe.num_experts",
    "num_experts_per_tok": "moe.top_k",
    "num_experts": "moe.num_experts",
    "n_routed_experts": "moe.num_experts",  # DeepSeek naming variant
    # MLA fields (DeepSeek multi-head latent attention)
    "kv_lora_rank": "attention.kv_lora_rank",
    "q_lora_rank": "attention.q_lora_rank",
    "qk_nope_head_dim": "attention.qk_nope_head_dim",
    "qk_rope_head_dim": "attention.qk_rope_head_dim",
    "v_head_dim": "attention.v_head_dim",
}
59+
60+
# Known model type → attention type mapping.
# "gqa" = grouped-query attention, "mla" = multi-head latent attention
# (DeepSeek), "mha" = classic multi-head attention.
ATTENTION_TYPE_MAP = {
    **{name: "gqa" for name in (
        "llama", "mistral", "mixtral", "qwen2",
        "qwen2_moe", "gemma", "gemma2", "phi3",
    )},
    **{name: "mla" for name in ("deepseek_v2", "deepseek_v3")},
    **{name: "mha" for name in ("gpt2", "gpt_neo", "gpt_neox", "falcon")},
}
77+
78+
# Known model type → FFN type mapping.
# "moe" marks the mixture-of-experts families; everything else is a dense MLP.
FFN_TYPE_MAP = {
    **{name: "mlp" for name in (
        "llama", "mistral", "qwen2", "gemma", "gemma2",
        "phi3", "gpt2", "gpt_neo", "gpt_neox", "falcon",
    )},
    **{name: "moe" for name in (
        "mixtral", "qwen2_moe", "deepseek_v2", "deepseek_v3",
    )},
}
95+
96+
# Known model type → activation function mapping
97+
ACTIVATION_MAP = {
98+
"llama": "silu",
99+
"mistral": "silu",
100+
"mixtral": "silu",
101+
"qwen2": "silu",
102+
"qwen2_moe": "silu",
103+
"gemma": "gelu",
104+
"gemma2": "gelu",
105+
"phi3": "silu",
106+
"gpt2": "gelu",
107+
"gpt_neo": "gelu",
108+
"gpt_neox": "gelu",
109+
"falcon": "gelu",
110+
}
111+
112+
113+
def _set_nested(d: dict, path: str, value) -> None:
114+
"""Set a value in a nested dict using dot-separated path."""
115+
keys = path.split(".")
116+
for key in keys[:-1]:
117+
d = d.setdefault(key, {})
118+
d[keys[-1]] = value
119+
120+
121+
def _get_nested(d: dict, path: str, default=None):
122+
"""Get a value from a nested dict using dot-separated path."""
123+
keys = path.split(".")
124+
for key in keys:
125+
if not isinstance(d, dict) or key not in d:
126+
return default
127+
d = d[key]
128+
return d
129+
130+
131+
def parse_hf_config(raw: dict) -> dict:
    """Parse a HuggingFace config.json dict into ModelPack transformer spec.

    Args:
        raw: The parsed config.json dict from HuggingFace.

    Returns:
        A dict following the ModelPack transformer spec format. Fields that
        cannot be reliably inferred are set to the NEEDS_REVIEW sentinel.
    """
    result: dict = {}
    model_type = raw.get("model_type", "").lower()

    # Map static fields
    for hf_key, mp_path in FIELD_MAP.items():
        if hf_key in raw and raw[hf_key] is not None:
            _set_nested(result, mp_path, raw[hf_key])

    # Derive head_dim if absent: hidden_size // num_attention_heads is the
    # HF Transformers default when head_dim is not given explicitly.
    if "attention" in result and "head_dim" not in result.get("attention", {}):
        hidden = result.get("hidden_size")
        n_heads = _get_nested(result, "attention.num_attention_heads")
        if hidden and n_heads:
            _set_nested(result, "attention.head_dim", hidden // n_heads)

    # Set architecture type (all model types in the maps are decoder-only)
    result["type"] = "decoder"
    result["architecture_version"] = "0.1.0"

    # Infer attention type from model_type
    attn_type = ATTENTION_TYPE_MAP.get(model_type, NEEDS_REVIEW)
    _set_nested(result, "attention.type", attn_type)
    _set_nested(result, "attention.is_causal", True)

    # Check for sliding window attention
    if raw.get("sliding_window") is not None:
        _set_nested(result, "attention.sliding_window", raw["sliding_window"])

    # Infer FFN type
    ffn_type = FFN_TYPE_MAP.get(model_type, NEEDS_REVIEW)
    result["ffn_type"] = ffn_type

    # Set activation function. HF spells this "hidden_act" on most models and
    # "activation_function" on the GPT-2 family. Fall back to the per-model
    # default table both when the name is unrecognized AND when the config
    # names no activation at all -- the previous code left `activation`
    # unbound in the latter case and raised NameError below.
    hf_activation = raw.get("hidden_act", raw.get("activation_function"))
    if hf_activation:
        activation = hf_activation.lower()
        if "silu" in activation or "swish" in activation:
            activation = "silu"
        elif "gelu" in activation:
            activation = "gelu"
        elif "relu" in activation:
            activation = "relu"
        else:
            activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW)
    else:
        activation = ACTIVATION_MAP.get(model_type, NEEDS_REVIEW)

    if ffn_type == "mlp":
        _set_nested(result, "mlp.activation", activation)
        # Most modern models use gated activation (SwiGLU, GeGLU)
        use_gated = model_type in (
            "llama", "mistral", "mixtral", "qwen2", "qwen2_moe", "phi3",
            "gemma", "gemma2", "deepseek_v2", "deepseek_v3",
        )
        _set_nested(result, "mlp.use_gated_activation", use_gated)
    elif ffn_type == "moe":
        _set_nested(result, "moe.activation", activation)
        # MoE-specific fields
        if "moe_intermediate_size" in raw:
            _set_nested(result, "moe.moe_intermediate_size", raw["moe_intermediate_size"])
        if "num_shared_experts" in raw:
            _set_nested(result, "moe.num_shared_experts", raw["num_shared_experts"])
        if "shared_expert_intermediate_size" in raw:
            _set_nested(
                result, "moe.shared_expert_intermediate_size",
                raw["shared_expert_intermediate_size"],
            )
        # DeepSeek MoE-specific fields (from PR #185 research)
        if "routed_scaling_factor" in raw:
            _set_nested(result, "moe.routed_scaling_factor", raw["routed_scaling_factor"])
        if "topk_method" in raw:
            _set_nested(result, "moe.topk_method", raw["topk_method"])
        if "norm_topk_prob" in raw:
            _set_nested(result, "moe.norm_topk_prob", raw["norm_topk_prob"])

    # Mixed layers support (DeepSeek uses dense layers before switching to MoE)
    if "first_k_dense_replace" in raw and "moe_layer_freq" in raw:
        result["layer_structure"] = "mixed"
        _set_nested(result, "mixed_layers.first_k_dense_replace", raw["first_k_dense_replace"])
        _set_nested(result, "mixed_layers.moe_layer_freq", raw["moe_layer_freq"])

    # Normalization: the GPT/Falcon lineage uses classic LayerNorm, the modern
    # decoder families use RMSNorm. gpt_neox and falcon were previously
    # mislabeled as rmsnorm; both models apply nn.LayerNorm.
    if model_type in ("gpt2", "gpt_neo", "gpt_neox", "falcon"):
        norm_type = "layernorm"
    else:
        norm_type = "rmsnorm"
    _set_nested(result, "norm.type", norm_type)

    # LayerNorm models spell epsilon either "layer_norm_eps" (GPT-NeoX) or
    # "layer_norm_epsilon" (GPT-2, Falcon); either overrides rms_norm_eps.
    for eps_key in ("layer_norm_eps", "layer_norm_epsilon"):
        if eps_key in raw:
            _set_nested(result, "norm.epsilon", raw[eps_key])
            break

    # Tokenizer: config.json carries no tokenizer information.
    # NOTE(review): "bpe" is an assumption (the common case for these
    # families) -- verify against the model's tokenizer config.
    _set_nested(result, "tokenizer.type", "bpe")
    _set_nested(result, "tokenizer.library", "huggingface")

    # Position embedding type: GPT-2 / GPT-Neo use learned absolute
    # positions; every other supported family uses rotary embeddings (RoPE).
    if model_type in ("gpt2", "gpt_neo"):
        _set_nested(result, "position_embedding.type", "learned")
    else:
        _set_nested(result, "position_embedding.type", "rope")

    # Embedding: whether input embedding and LM head share weights
    tie_embeddings = raw.get("tie_word_embeddings", False)
    _set_nested(result, "token_embedding.shared_embedding", tie_embeddings)

    # Bias flags (HF defaults to bias-free attention/MLP when absent)
    attn_bias = raw.get("attention_bias", False)
    _set_nested(result, "attention.has_qkv_bias", attn_bias)
    _set_nested(result, "attention.has_output_bias", attn_bias)

    mlp_bias = raw.get("mlp_bias", False)
    if ffn_type == "mlp":
        _set_nested(result, "mlp.has_bias", mlp_bias)

    return result
252+
253+
254+
def format_yaml(spec: dict, indent: int = 0) -> str:
    """Render *spec* as YAML text, one extra indent level per nesting depth.

    Strings are double-quoted, except the NEEDS_REVIEW sentinel, which is
    emitted bare with a trailing "# requires human review" comment so it
    stands out in the generated file.
    """
    pad = "  " * indent
    rendered = []
    for key, value in spec.items():
        if isinstance(value, dict):
            rendered.append(f"{pad}{key}:")
            rendered.append(format_yaml(value, indent + 1))
            continue
        if isinstance(value, bool):
            # bool check must precede the generic fallthrough: YAML wants
            # lowercase true/false, not Python's True/False.
            rendered.append(f"{pad}{key}: {'true' if value else 'false'}")
        elif isinstance(value, str):
            if value == NEEDS_REVIEW:
                rendered.append(f"{pad}{key}: {value} # requires human review")
            else:
                rendered.append(f'{pad}{key}: "{value}"')
        elif value is None:
            rendered.append(f"{pad}{key}: null")
        else:
            rendered.append(f"{pad}{key}: {value}")
    return "\n".join(rendered)
274+
275+
276+
def load_config(source: str) -> dict:
    """Load a config.json from a file path or HuggingFace model ID.

    Args:
        source: Either a local file path or a HuggingFace model ID
            (e.g. "meta-llama/Meta-Llama-3-8B").

    Returns:
        The parsed config.json dict.

    Exits with status 1 (after printing an error to stderr) when the local
    file cannot be read or parsed, when huggingface_hub is not installed,
    or when the Hub download fails.
    """
    path = Path(source)
    if path.is_file():
        try:
            with path.open(encoding="utf-8") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            # Previously an unreadable or invalid local file surfaced as a
            # raw traceback instead of the tool's clean error path.
            print(f"error: failed to load config from '{source}': {e}", file=sys.stderr)
            sys.exit(1)

    # Try loading from HuggingFace Hub
    try:
        from huggingface_hub import hf_hub_download

        config_path = hf_hub_download(repo_id=source, filename="config.json")
        with open(config_path, encoding="utf-8") as f:
            return json.load(f)
    except ImportError:
        print(
            "error: huggingface_hub not installed. "
            "Install with: pip install huggingface_hub",
            file=sys.stderr,
        )
        sys.exit(1)
    except Exception as e:
        # Broad catch is deliberate at this CLI boundary: any download or
        # parse failure becomes one clean error message and a nonzero exit.
        print(f"error: failed to load config from '{source}': {e}", file=sys.stderr)
        sys.exit(1)
307+
308+
309+
def main() -> int:
    """CLI entry point: load a config, convert it, and print the spec.

    Returns:
        Process exit code (0 on success; load failures exit inside
        load_config).
    """
    parser = argparse.ArgumentParser(
        description="Parse HuggingFace config.json into ModelPack transformer spec",
    )
    parser.add_argument(
        "model",
        help="HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B) "
        "or path to config.json",
    )
    parser.add_argument(
        "--format",
        choices=["yaml", "json"],
        default="yaml",
        help="Output format (default: yaml)",
    )

    args = parser.parse_args()

    raw = load_config(args.model)
    spec = parse_hf_config(raw)

    model_type = raw.get("model_type", "unknown")
    model_name = raw.get("_name_or_path", args.model)

    # Collect NEEDS_REVIEW fields up front so both output formats can report.
    needs_review: list = []
    _find_needs_review(spec, "", needs_review)

    if args.format == "json":
        print(json.dumps(spec, indent=2))
        # The review report must go to stderr here: appending comment lines
        # to stdout (as the code previously did) corrupts the JSON output
        # for any consumer that pipes it.
        if needs_review:
            print(f"# --- Fields requiring review ({len(needs_review)}) ---", file=sys.stderr)
            for field in needs_review:
                print(f"# - {field}", file=sys.stderr)
    else:
        print("# ModelPack Transformer Spec")
        print(f"# Generated from: {model_name}")
        print(f"# Model type: {model_type}")
        print("# NOTE: Fields marked NEEDS_REVIEW require human verification")
        print()
        print(format_yaml(spec))
        # YAML comments are harmless on stdout, so keep the report inline.
        if needs_review:
            print(f"\n# --- Fields requiring review ({len(needs_review)}) ---")
            for field in needs_review:
                print(f"# - {field}")

    return 0
352+
353+
354+
def _find_needs_review(d: dict, prefix: str, result: list) -> None:
    """Append the dotted path of every NEEDS_REVIEW leaf in *d* to *result*."""
    for key, value in d.items():
        dotted = key if not prefix else f"{prefix}.{key}"
        if isinstance(value, dict):
            _find_needs_review(value, dotted, result)
        elif value == NEEDS_REVIEW:
            result.append(dotted)
362+
363+
364+
if __name__ == "__main__":
    # sys.exit raises SystemExit carrying main()'s return code -- identical
    # to `raise SystemExit(main())`.
    sys.exit(main())

0 commit comments

Comments
 (0)