-
Notifications
You must be signed in to change notification settings - Fork 982
Centralize hf:// URI parsing into utils/_hf_uri.py
#3994
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
0d676aa
34591c8
0df167d
4d3b1a2
44364b6
397c88d
1aaaa7f
4c080e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -78,6 +78,7 @@ | |
| from huggingface_hub._jobs_api import Volume | ||
| from huggingface_hub.errors import CLIError, HfHubHTTPError | ||
| from huggingface_hub.utils import logging | ||
| from huggingface_hub.utils._hf_uri import ParsedBucketUrl, parse_hf_url | ||
| from huggingface_hub.utils._cache_manager import _format_size | ||
|
|
||
| from ._cli_utils import ( | ||
|
|
@@ -1099,12 +1100,6 @@ def _parse_volumes(volumes: Optional[list[str]]) -> Optional[list[Volume]]: | |
| return None | ||
|
|
||
| HF_PREFIX = "hf://" | ||
| HF_TYPES_MAPPING = { | ||
| "models": constants.REPO_TYPE_MODEL, | ||
| "datasets": constants.REPO_TYPE_DATASET, | ||
| "spaces": constants.REPO_TYPE_SPACE, | ||
| "buckets": "bucket", | ||
| } | ||
|
|
||
| result: list[Volume] = [] | ||
| for raw_spec in volumes: | ||
|
|
@@ -1124,45 +1119,29 @@ def _parse_volumes(volumes: Optional[list[str]]) -> Optional[list[Volume]]: | |
| f"Invalid volume format: '{raw_spec}'. Source must start with 'hf://'. " | ||
| f"Expected hf://[TYPE/]SOURCE:/MOUNT_PATH[:ro]. E.g. hf://gpt2:/data" | ||
| ) | ||
| spec = spec[len(HF_PREFIX) :] | ||
|
|
||
| # Find the mount path: look for :/ pattern | ||
| colon_slash_idx = spec.find(":/") | ||
| # We search in the part after "hf://" to avoid matching the "://" in the prefix. | ||
| after_prefix = spec[len(HF_PREFIX) :] | ||
| colon_slash_idx = after_prefix.find(":/") | ||
| if colon_slash_idx == -1: | ||
| raise CLIError( | ||
| f"Invalid volume format: '{raw_spec}'. Expected hf://[TYPE/]SOURCE:/MOUNT_PATH[:ro]. E.g. hf://gpt2:/data" | ||
| ) | ||
| source_part = spec[:colon_slash_idx] | ||
| mount_path = spec[colon_slash_idx + 1 :] | ||
|
|
||
| # Parse type from source_part (first segment before /) | ||
| # Then split remaining into source (namespace/name or name) and optional path. | ||
| slash_idx = source_part.find("/") | ||
| if slash_idx == -1: | ||
| # No slash: bare source like "gpt2" -> model type | ||
| vol_type_str = constants.REPO_TYPE_MODEL | ||
| source = source_part | ||
| path = None | ||
| source_uri = HF_PREFIX + after_prefix[:colon_slash_idx] | ||
| mount_path = after_prefix[colon_slash_idx + 1 :] | ||
|
|
||
| # Parse the source URI using the central parser. | ||
| parsed = parse_hf_url(source_uri) | ||
|
|
||
| if isinstance(parsed, ParsedBucketUrl): | ||
| vol_type_str = "bucket" | ||
| source = parsed.bucket_id | ||
| path = parsed.path or None | ||
| else: | ||
| first_segment = source_part[:slash_idx] | ||
| if first_segment in HF_TYPES_MAPPING: | ||
| vol_type_str = HF_TYPES_MAPPING[first_segment] | ||
| remaining = source_part[slash_idx + 1 :] | ||
| else: | ||
| # First segment isn't a known type -> model type | ||
| vol_type_str = constants.REPO_TYPE_MODEL | ||
| remaining = source_part | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Volume
|
||
|
|
||
| # Split remaining into source (namespace/name) and optional path. | ||
| # Repo/bucket IDs are "namespace/name" (2 segments) or "name" (1 segment). | ||
| # Any extra segments are the path inside the repo/bucket. | ||
| parts = remaining.split("/", 2) | ||
| if len(parts) >= 3: | ||
| source = parts[0] + "/" + parts[1] | ||
| path = parts[2] | ||
| else: | ||
| source = remaining | ||
| path = None | ||
| vol_type_str = parsed.repo_type | ||
| source = parsed.repo_id | ||
| path = parsed.path_in_repo or None | ||
|
|
||
| result.append( | ||
| Volume( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -301,6 +301,9 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu | |
| [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) | ||
| If `repo_type` is unknown. | ||
| """ | ||
|
|
||
| from .utils._hf_uri import ParsedBucketUrl, parse_hf_url | ||
|
|
||
| input_hf_id = hf_id | ||
|
|
||
| # Get the hub_url (with or without protocol) | ||
|
|
@@ -315,86 +318,45 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu | |
| if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists | ||
| hf_id = hf_id[len(HFFS_PREFIX) :] | ||
|
|
||
| # If it's a URL, strip the endpoint prefix to get the path | ||
| # If it's a URL, strip the endpoint prefix to get the relative path | ||
| if is_hf_url: | ||
| # Remove protocol if present | ||
| hf_id_normalized = _REGEX_HTTP_PROTOCOL.sub("", hf_id) | ||
|
|
||
| # Remove the hub_url prefix to get the relative path | ||
| if hf_id_normalized.startswith(hub_url_without_protocol): | ||
| # Strip the hub URL and any leading slashes | ||
| hf_id = hf_id_normalized[len(hub_url_without_protocol) :].lstrip("/") | ||
|
|
||
| # At this point hf_id is a relative path like "datasets/user/repo", "user/repo", or "repo". | ||
| url_segments = hf_id.split("/") | ||
| is_hf_id = len(url_segments) <= 3 | ||
|
|
||
| namespace: Optional[str] | ||
| if is_hf_url: | ||
| # For URLs, we need to extract repo_type, namespace, repo_id | ||
| # Expected format after stripping endpoint: [repo_type]/namespace/repo_id or namespace/repo_id | ||
|
|
||
| if len(url_segments) >= 3: | ||
| # Check if first segment is a repo type | ||
| if url_segments[0] in constants.REPO_TYPES_MAPPING: | ||
| repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] | ||
| namespace = url_segments[1] | ||
| repo_id = url_segments[2] | ||
| elif url_segments[0] == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = url_segments[1] | ||
| repo_id = url_segments[2] | ||
| else: | ||
| # First segment is namespace | ||
| namespace = url_segments[0] | ||
| repo_id = url_segments[1] | ||
| repo_type = None | ||
| elif len(url_segments) == 2: | ||
| namespace = url_segments[0] | ||
| repo_id = url_segments[1] | ||
|
|
||
| # Check if namespace is actually a repo type mapping | ||
| if namespace in constants.REPO_TYPES_MAPPING: | ||
| # Mean canonical dataset or model | ||
| repo_type = constants.REPO_TYPES_MAPPING[namespace] | ||
| namespace = None | ||
| elif namespace == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| else: | ||
| repo_type = None | ||
| else: | ||
| # Single segment | ||
| repo_id = url_segments[0] | ||
| namespace = None | ||
| repo_type = None | ||
| elif is_hf_id: | ||
| if len(url_segments) == 3: | ||
| # Passed <repo_type>/<user>/<model_id> or <repo_type>/<org>/<model_id> | ||
| repo_type, namespace, repo_id = url_segments[-3:] | ||
| elif len(url_segments) == 2: | ||
| if url_segments[0] in constants.REPO_TYPES_MAPPING: | ||
| # Passed '<model_id>' or 'datasets/<dataset_id>' for a canonical model or dataset | ||
| repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] | ||
| namespace = None | ||
| repo_id = hf_id.split("/")[-1] | ||
| elif url_segments[0] == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| repo_id = hf_id.split("/")[-1] | ||
| else: | ||
| # Passed <user>/<model_id> or <org>/<model_id> | ||
| namespace, repo_id = hf_id.split("/")[-2:] | ||
| repo_type = None | ||
| else: | ||
| # Passed <model_id> | ||
| repo_id = url_segments[0] | ||
| namespace, repo_type = None, None | ||
| else: | ||
| # For non-URL inputs, reject paths with more than 3 segments. | ||
| # URL inputs can have extra segments (e.g. /blob/main/file.txt) which are ignored. | ||
| if not is_hf_url and len(url_segments) > 3: | ||
| raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}") | ||
|
|
||
| if not is_hf_url and len(url_segments) == 3: | ||
| # Passed <repo_type>/<user>/<model_id> — accept singular type names | ||
| # (e.g. "dataset/user/id") which parse_hf_url doesn't handle. | ||
| repo_type, namespace, repo_id = url_segments | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bucket paths with 3 segments fail validation — High Severity. When |
||
| elif not is_hf_url and len(url_segments) == 2 and url_segments[0] == "buckets": | ||
| # Special case: "buckets/name" (no namespace) — parse_hf_url expects | ||
| # namespace/name for buckets, but this function accepts bare bucket names. | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| repo_id = url_segments[1] | ||
| else: | ||
| # Delegate to the central parser for type detection, bucket handling, etc. | ||
| parsed = parse_hf_url(hf_id) | ||
|
|
||
| if isinstance(parsed, ParsedBucketUrl): | ||
| repo_type: Optional[str] = "bucket" | ||
| namespace = parsed.namespace | ||
| repo_id = parsed.bucket_name | ||
| else: | ||
| # When no type prefix is present, parse_hf_url defaults to "model". | ||
| # This function returns None instead. | ||
| repo_type = parsed.repo_type if parsed.has_explicit_type else None | ||
| namespace = parsed.namespace | ||
| repo_id = parsed.repo_name | ||
|
|
||
| # Check if repo type is known (mapping "spaces" => "space" + empty value => `None`) | ||
| if repo_type in constants.REPO_TYPES_MAPPING: | ||
| repo_type = constants.REPO_TYPES_MAPPING[repo_type] | ||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.