Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions src/huggingface_hub/_buckets.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from . import constants, logging
from .errors import BucketNotFoundError
from .utils import XetFileData, disable_progress_bars, enable_progress_bars, parse_datetime
from .utils._hf_uri import parse_hf_url
from .utils._terminal import StatusLine


Expand All @@ -56,12 +57,8 @@ def _split_bucket_id_and_prefix(path: str) -> tuple[str, str]:
Returns (bucket_id, prefix) where prefix may be empty string.
Raises ValueError if path doesn't contain at least namespace/name.
"""
parts = path.split("/", 2)
if len(parts) < 2 or not parts[0] or not parts[1]:
raise ValueError(f"Invalid bucket path: '{path}'. Expected format: namespace/bucket_name")
bucket_id = f"{parts[0]}/{parts[1]}"
prefix = parts[2] if len(parts) > 2 else ""
return bucket_id, prefix
parsed = parse_hf_url(f"buckets/{path}")
return parsed.bucket_id, parsed.path


@dataclass
Expand Down Expand Up @@ -242,9 +239,8 @@ def _parse_bucket_path(path: str) -> tuple[str, str]:
Returns:
tuple: (bucket_id, prefix) where bucket_id is "namespace/bucket_name" and prefix may be empty string.
"""
if not path.startswith(BUCKET_PREFIX):
raise ValueError(f"Invalid bucket path: {path}. Must start with {BUCKET_PREFIX}")
return _split_bucket_id_and_prefix(path.removeprefix(BUCKET_PREFIX))
parsed = parse_hf_url(path)
return parsed.bucket_id, parsed.path


def _is_bucket_path(path: str) -> bool:
Expand Down
55 changes: 17 additions & 38 deletions src/huggingface_hub/cli/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
from huggingface_hub._jobs_api import Volume
from huggingface_hub.errors import CLIError, HfHubHTTPError
from huggingface_hub.utils import logging
from huggingface_hub.utils._hf_uri import ParsedBucketUrl, parse_hf_url
from huggingface_hub.utils._cache_manager import _format_size

from ._cli_utils import (
Expand Down Expand Up @@ -1099,12 +1100,6 @@ def _parse_volumes(volumes: Optional[list[str]]) -> Optional[list[Volume]]:
return None

HF_PREFIX = "hf://"
HF_TYPES_MAPPING = {
"models": constants.REPO_TYPE_MODEL,
"datasets": constants.REPO_TYPE_DATASET,
"spaces": constants.REPO_TYPE_SPACE,
"buckets": "bucket",
}

result: list[Volume] = []
for raw_spec in volumes:
Expand All @@ -1124,45 +1119,29 @@ def _parse_volumes(volumes: Optional[list[str]]) -> Optional[list[Volume]]:
f"Invalid volume format: '{raw_spec}'. Source must start with 'hf://'. "
f"Expected hf://[TYPE/]SOURCE:/MOUNT_PATH[:ro]. E.g. hf://gpt2:/data"
)
spec = spec[len(HF_PREFIX) :]

# Find the mount path: look for :/ pattern
colon_slash_idx = spec.find(":/")
# We search in the part after "hf://" to avoid matching the "://" in the prefix.
after_prefix = spec[len(HF_PREFIX) :]
colon_slash_idx = after_prefix.find(":/")
if colon_slash_idx == -1:
raise CLIError(
f"Invalid volume format: '{raw_spec}'. Expected hf://[TYPE/]SOURCE:/MOUNT_PATH[:ro]. E.g. hf://gpt2:/data"
)
source_part = spec[:colon_slash_idx]
mount_path = spec[colon_slash_idx + 1 :]

# Parse type from source_part (first segment before /)
# Then split remaining into source (namespace/name or name) and optional path.
slash_idx = source_part.find("/")
if slash_idx == -1:
# No slash: bare source like "gpt2" -> model type
vol_type_str = constants.REPO_TYPE_MODEL
source = source_part
path = None
source_uri = HF_PREFIX + after_prefix[:colon_slash_idx]
mount_path = after_prefix[colon_slash_idx + 1 :]

# Parse the source URI using the central parser.
parsed = parse_hf_url(source_uri)

if isinstance(parsed, ParsedBucketUrl):
vol_type_str = "bucket"
source = parsed.bucket_id
path = parsed.path or None
else:
first_segment = source_part[:slash_idx]
if first_segment in HF_TYPES_MAPPING:
vol_type_str = HF_TYPES_MAPPING[first_segment]
remaining = source_part[slash_idx + 1 :]
else:
# First segment isn't a known type -> model type
vol_type_str = constants.REPO_TYPE_MODEL
remaining = source_part
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Volume @revision silently dropped during parsing

High Severity

The Volume dataclass has a revision field, but the refactored _parse_volumes never populates it. The old code preserved @revision as part of the source string (e.g., source="user/repo@main"). Now parse_hf_url extracts the revision into parsed.revision and strips it from repo_id, but parsed.revision is never passed to the Volume constructor. For any volume spec using @revision syntax, the revision is silently lost.

Additional Locations (1)
Fix in Cursor Fix in Web


# Split remaining into source (namespace/name) and optional path.
# Repo/bucket IDs are "namespace/name" (2 segments) or "name" (1 segment).
# Any extra segments are the path inside the repo/bucket.
parts = remaining.split("/", 2)
if len(parts) >= 3:
source = parts[0] + "/" + parts[1]
path = parts[2]
else:
source = remaining
path = None
vol_type_str = parsed.repo_type
source = parsed.repo_id
path = parsed.path_in_repo or None

result.append(
Volume(
Expand Down
104 changes: 33 additions & 71 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
If `repo_type` is unknown.
"""

from .utils._hf_uri import ParsedBucketUrl, parse_hf_url

input_hf_id = hf_id

# Get the hub_url (with or without protocol)
Expand All @@ -315,86 +318,45 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu
if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists
hf_id = hf_id[len(HFFS_PREFIX) :]

# If it's a URL, strip the endpoint prefix to get the path
# If it's a URL, strip the endpoint prefix to get the relative path
if is_hf_url:
# Remove protocol if present
hf_id_normalized = _REGEX_HTTP_PROTOCOL.sub("", hf_id)

# Remove the hub_url prefix to get the relative path
if hf_id_normalized.startswith(hub_url_without_protocol):
# Strip the hub URL and any leading slashes
hf_id = hf_id_normalized[len(hub_url_without_protocol) :].lstrip("/")

# At this point hf_id is a relative path like "datasets/user/repo", "user/repo", or "repo".
url_segments = hf_id.split("/")
is_hf_id = len(url_segments) <= 3

namespace: Optional[str]
if is_hf_url:
# For URLs, we need to extract repo_type, namespace, repo_id
# Expected format after stripping endpoint: [repo_type]/namespace/repo_id or namespace/repo_id

if len(url_segments) >= 3:
# Check if first segment is a repo type
if url_segments[0] in constants.REPO_TYPES_MAPPING:
repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]]
namespace = url_segments[1]
repo_id = url_segments[2]
elif url_segments[0] == "buckets":
# Special case for buckets
repo_type = "bucket"
namespace = url_segments[1]
repo_id = url_segments[2]
else:
# First segment is namespace
namespace = url_segments[0]
repo_id = url_segments[1]
repo_type = None
elif len(url_segments) == 2:
namespace = url_segments[0]
repo_id = url_segments[1]

# Check if namespace is actually a repo type mapping
if namespace in constants.REPO_TYPES_MAPPING:
# Mean canonical dataset or model
repo_type = constants.REPO_TYPES_MAPPING[namespace]
namespace = None
elif namespace == "buckets":
# Special case for buckets
repo_type = "bucket"
namespace = None
else:
repo_type = None
else:
# Single segment
repo_id = url_segments[0]
namespace = None
repo_type = None
elif is_hf_id:
if len(url_segments) == 3:
# Passed <repo_type>/<user>/<model_id> or <repo_type>/<org>/<model_id>
repo_type, namespace, repo_id = url_segments[-3:]
elif len(url_segments) == 2:
if url_segments[0] in constants.REPO_TYPES_MAPPING:
# Passed '<model_id>' or 'datasets/<dataset_id>' for a canonical model or dataset
repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]]
namespace = None
repo_id = hf_id.split("/")[-1]
elif url_segments[0] == "buckets":
# Special case for buckets
repo_type = "bucket"
namespace = None
repo_id = hf_id.split("/")[-1]
else:
# Passed <user>/<model_id> or <org>/<model_id>
namespace, repo_id = hf_id.split("/")[-2:]
repo_type = None
else:
# Passed <model_id>
repo_id = url_segments[0]
namespace, repo_type = None, None
else:
# For non-URL inputs, reject paths with more than 3 segments.
# URL inputs can have extra segments (e.g. /blob/main/file.txt) which are ignored.
if not is_hf_url and len(url_segments) > 3:
raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}")

if not is_hf_url and len(url_segments) == 3:
# Passed <repo_type>/<user>/<model_id> — accept singular type names
# (e.g. "dataset/user/id") which parse_hf_url doesn't handle.
repo_type, namespace, repo_id = url_segments
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bucket paths with 3 segments fail validation

High Severity

When repo_type_and_id_from_hf_id receives a 3-segment non-URL bucket path like "hf://buckets/ns/name", the code at the len(url_segments) == 3 branch does a simple repo_type, namespace, repo_id = url_segments, setting repo_type to "buckets" (plural). The downstream validation checks repo_type != "bucket" (singular), so "buckets" fails and raises a ValueError. The old code had explicit elif url_segments[0] == "buckets" handling that correctly set repo_type = "bucket".

Fix in Cursor Fix in Web

elif not is_hf_url and len(url_segments) == 2 and url_segments[0] == "buckets":
# Special case: "buckets/name" (no namespace) — parse_hf_url expects
# namespace/name for buckets, but this function accepts bare bucket names.
repo_type = "bucket"
namespace = None
repo_id = url_segments[1]
else:
# Delegate to the central parser for type detection, bucket handling, etc.
parsed = parse_hf_url(hf_id)

if isinstance(parsed, ParsedBucketUrl):
repo_type: Optional[str] = "bucket"
namespace = parsed.namespace
repo_id = parsed.bucket_name
else:
# When no type prefix is present, parse_hf_url defaults to "model".
# This function returns None instead.
repo_type = parsed.repo_type if parsed.has_explicit_type else None
namespace = parsed.namespace
repo_id = parsed.repo_name

# Check if repo type is known (mapping "spaces" => "space" + empty value => `None`)
if repo_type in constants.REPO_TYPES_MAPPING:
repo_type = constants.REPO_TYPES_MAPPING[repo_type]
Expand Down
Loading