Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- avoid repeatedly scanning sharded model families during directory scans
- keep shard sibling discovery within the requested scan root
- preserve per-shard metadata when aggregating sharded model families
- preserve HuggingFace snapshot shard paths while grouping cache-backed families
- distinguish ASCII-serialized Torch7 artifacts from plain PyTorch source text

## [0.2.45](https://github.com/promptfoo/modelaudit/compare/v0.2.44...v0.2.45) (2026-05-03)
Expand Down
17 changes: 13 additions & 4 deletions modelaudit/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,12 @@ def scan_model_directory_or_file(
)
if resolved_file is None:
continue
snapshot_path = Path(file_path).absolute()
scan_source = (
snapshot_path
if is_hf_cache_symlink and _shard_family_key_for_path(str(snapshot_path)) is not None
else resolved_file
Comment thread
mldangelo-oai marked this conversation as resolved.
Outdated
Comment thread
mldangelo-oai marked this conversation as resolved.
Outdated
)

# Skip non-model files early if filtering is enabled
# Note: skip_file_types parameter already contains the correct value
Expand Down Expand Up @@ -647,17 +653,18 @@ def scan_model_directory_or_file(
continue

# Handle DVC files and get target paths
target_paths = [resolved_file]
target_paths = [scan_source]
if file.endswith(".dvc"):
dvc_targets = resolve_dvc_file(file_path)
if dvc_targets:
target_paths = [Path(t).resolve() for t in dvc_targets]

for target_path in target_paths:
target_str = str(target_path)
if target_str in scanned_paths:
dedupe_target_str = str(target_path.resolve()) if is_hf_cache_symlink else target_str
if dedupe_target_str in scanned_paths:
Comment thread
mldangelo-oai marked this conversation as resolved.
Outdated
continue
scanned_paths.add(target_str)
scanned_paths.add(dedupe_target_str)

if not is_hf_cache_symlink and not is_within_directory(str(base_dir), str(target_path)):
_add_issue_to_model(
Expand Down Expand Up @@ -690,7 +697,9 @@ def scan_model_directory_or_file(
resolved_shard_path,
)
)
if shard_in_base_dir or shard_in_hf_blobs:
if shard_in_hf_blobs:
family_paths.add(str(Path(shard_path).absolute()))
elif shard_in_base_dir:
family_paths.add(resolved_shard_path)
else:
_add_issue_to_model(
Expand Down
120 changes: 120 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,126 @@ def fake_scan_file(path: str, config: dict[str, Any] | None = None) -> ScanResul
)


@pytest.mark.usefixtures("requires_symlinks")
def test_directory_scan_groups_hf_cache_sharded_symlinks(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Scan one HF-cache-style snapshot whose shards are symlinks into ``blobs``.

    The test expects a single ``scan_file`` call for the two-shard family,
    snapshot link paths in the reported metadata and assets, and resolved
    blob paths in the shard-family cache fingerprint.
    """
    hf_home = tmp_path / "hf-home"
    monkeypatch.setenv("HF_HOME", str(hf_home))

    # Lay out <HF_HOME>/hub/models--org--model/{snapshots/abc123,blobs}.
    model_cache = hf_home / "hub" / "models--org--model"
    revision_dir = model_cache / "snapshots" / "abc123"
    blob_store = model_cache / "blobs"
    revision_dir.mkdir(parents=True)
    blob_store.mkdir()

    resolved_blobs: list[Path] = []
    snapshot_links: list[Path] = []
    for index in (1, 2):
        blob = blob_store / f"blob-{index}"
        blob.write_bytes(f"hf-shard-{index}".encode())
        link = revision_dir / f"model-{index:05d}-of-00002.safetensors"
        # Relative target, resolved from the snapshot revision directory.
        link.symlink_to(Path("../../blobs") / blob.name)
        resolved_blobs.append(blob.resolve())
        snapshot_links.append(link)

    captured_configs: list[dict[str, Any]] = []
    calls: list[str] = []

    def fake_scan_file(path: str, config: dict[str, Any] | None = None) -> ScanResult:
        # Record the invocation, then report the combined size of both blobs.
        calls.append(path)
        captured_configs.append(dict(config or {}))
        total_bytes = sum(blob.stat().st_size for blob in resolved_blobs)
        return _mock_sharded_scan_result(total_bytes)

    monkeypatch.setattr(core_module, "scan_file", fake_scan_file)

    result = core_module.scan_model_directory_or_file(str(revision_dir), cache_scan_results=False)

    material_config = normalize_material_scan_config(captured_configs[0])
    fingerprint = material_config[core_module._SHARD_FAMILY_CACHE_FINGERPRINT_CONFIG_KEY]
    fingerprint_paths = {member["path"] for member in fingerprint["members"]}

    assert len(calls) == 1
    assert Path(calls[0]).name in {link.name for link in snapshot_links}
    assert result.files_scanned == len(snapshot_links)
    assert set(result.file_metadata) == {str(link) for link in snapshot_links}
    assert {asset.path for asset in result.assets} == {str(link) for link in snapshot_links}
    assert fingerprint_paths == {str(blob) for blob in resolved_blobs}
    assert not any("path traversal" in issue.message.lower() for issue in result.issues)


@pytest.mark.usefixtures("requires_symlinks")
def test_directory_scan_keeps_nonsharded_hf_snapshot_aliases_deduplicated(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Scan two snapshot revisions whose ``model.safetensors`` share one blob.

    The test expects exactly one ``scan_file`` call, made with the resolved
    blob path, and a ``files_scanned`` count of 1.
    """
    hf_home = tmp_path / "hf-home"
    monkeypatch.setenv("HF_HOME", str(hf_home))

    model_cache = hf_home / "hub" / "models--org--model"
    snapshots_root = model_cache / "snapshots"
    blob_store = model_cache / "blobs"
    blob_store.mkdir(parents=True)

    blob_path = blob_store / "shared-blob"
    blob_path.write_bytes(b"shared-model")

    # Both revisions expose the same blob under the same non-sharded file name.
    link_target = Path("../../blobs") / blob_path.name
    for revision in ("abc123", "def456"):
        revision_dir = snapshots_root / revision
        revision_dir.mkdir(parents=True)
        alias = revision_dir / "model.safetensors"
        alias.symlink_to(link_target)

    calls: list[str] = []

    def fake_scan_file(path: str, config: dict[str, Any] | None = None) -> ScanResult:
        # Record which path the scanner was asked to open.
        calls.append(path)
        blob_size = blob_path.stat().st_size
        return _mock_sharded_scan_result(blob_size)

    monkeypatch.setattr(core_module, "scan_file", fake_scan_file)

    result = core_module.scan_model_directory_or_file(str(snapshots_root), cache_scan_results=False)

    expected_calls = [str(blob_path.resolve())]
    assert calls == expected_calls
    assert result.files_scanned == 1


@pytest.mark.usefixtures("requires_symlinks")
def test_directory_scan_deduplicates_identical_hf_shard_families_across_snapshots(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Scan two snapshot revisions that link the same two-shard family.

    Both revisions symlink identically named shards to the same blobs. The
    test expects one ``scan_file`` call, file and byte counts matching the two
    blobs, and resolved blob paths in the shard-family cache fingerprint.
    """
    hf_home = tmp_path / "hf-home"
    monkeypatch.setenv("HF_HOME", str(hf_home))

    model_cache = hf_home / "hub" / "models--org--model"
    snapshots_root = model_cache / "snapshots"
    blob_store = model_cache / "blobs"
    blob_store.mkdir(parents=True)

    resolved_blobs: list[Path] = []
    for index in (1, 2):
        blob = blob_store / f"blob-{index}"
        blob.write_bytes(f"shared-hf-shard-{index}".encode())
        resolved_blobs.append(blob.resolve())

        shard_name = f"model-{index:05d}-of-00002.safetensors"
        for revision in ("abc123", "def456"):
            revision_dir = snapshots_root / revision
            # exist_ok: the revision directory already exists on the second shard pass.
            revision_dir.mkdir(parents=True, exist_ok=True)
            (revision_dir / shard_name).symlink_to(Path("../../blobs") / blob.name)

    captured_configs: list[dict[str, Any]] = []
    calls: list[str] = []

    def fake_scan_file(path: str, config: dict[str, Any] | None = None) -> ScanResult:
        # Record the invocation, then report the combined size of both blobs.
        calls.append(path)
        captured_configs.append(dict(config or {}))
        total_bytes = sum(blob.stat().st_size for blob in resolved_blobs)
        return _mock_sharded_scan_result(total_bytes)

    monkeypatch.setattr(core_module, "scan_file", fake_scan_file)

    result = core_module.scan_model_directory_or_file(str(snapshots_root), cache_scan_results=False)

    material_config = normalize_material_scan_config(captured_configs[0])
    fingerprint = material_config[core_module._SHARD_FAMILY_CACHE_FINGERPRINT_CONFIG_KEY]
    fingerprint_paths = {member["path"] for member in fingerprint["members"]}

    assert len(calls) == 1
    assert result.files_scanned == len(resolved_blobs)
    assert result.bytes_scanned == sum(blob.stat().st_size for blob in resolved_blobs)
    assert fingerprint_paths == {str(blob) for blob in resolved_blobs}


def test_scan_file_passes_shard_allowlist_to_advanced_handler(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
Expand Down
Loading