-
Notifications
You must be signed in to change notification settings - Fork 982
Centralize hf:// URI parsing into utils/_hf_uri.py
#3994
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
0d676aa
34591c8
0df167d
4d3b1a2
44364b6
397c88d
1aaaa7f
4c080e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -301,6 +301,9 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu | |
| [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) | ||
| If `repo_type` is unknown. | ||
| """ | ||
|
|
||
| from .utils._hf_uri import ParsedBucketUrl, parse_hf_url | ||
|
|
||
| input_hf_id = hf_id | ||
|
|
||
| # Get the hub_url (with or without protocol) | ||
|
|
@@ -315,86 +318,45 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu | |
| if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists | ||
| hf_id = hf_id[len(HFFS_PREFIX) :] | ||
|
|
||
| # If it's a URL, strip the endpoint prefix to get the path | ||
| # If it's a URL, strip the endpoint prefix to get the relative path | ||
| if is_hf_url: | ||
| # Remove protocol if present | ||
| hf_id_normalized = _REGEX_HTTP_PROTOCOL.sub("", hf_id) | ||
|
|
||
| # Remove the hub_url prefix to get the relative path | ||
| if hf_id_normalized.startswith(hub_url_without_protocol): | ||
| # Strip the hub URL and any leading slashes | ||
| hf_id = hf_id_normalized[len(hub_url_without_protocol) :].lstrip("/") | ||
|
|
||
| # At this point hf_id is a relative path like "datasets/user/repo", "user/repo", or "repo". | ||
| url_segments = hf_id.split("/") | ||
| is_hf_id = len(url_segments) <= 3 | ||
|
|
||
| namespace: Optional[str] | ||
| if is_hf_url: | ||
| # For URLs, we need to extract repo_type, namespace, repo_id | ||
| # Expected format after stripping endpoint: [repo_type]/namespace/repo_id or namespace/repo_id | ||
|
|
||
| if len(url_segments) >= 3: | ||
| # Check if first segment is a repo type | ||
| if url_segments[0] in constants.REPO_TYPES_MAPPING: | ||
| repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] | ||
| namespace = url_segments[1] | ||
| repo_id = url_segments[2] | ||
| elif url_segments[0] == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = url_segments[1] | ||
| repo_id = url_segments[2] | ||
| else: | ||
| # First segment is namespace | ||
| namespace = url_segments[0] | ||
| repo_id = url_segments[1] | ||
| repo_type = None | ||
| elif len(url_segments) == 2: | ||
| namespace = url_segments[0] | ||
| repo_id = url_segments[1] | ||
|
|
||
| # Check if namespace is actually a repo type mapping | ||
| if namespace in constants.REPO_TYPES_MAPPING: | ||
| # Mean canonical dataset or model | ||
| repo_type = constants.REPO_TYPES_MAPPING[namespace] | ||
| namespace = None | ||
| elif namespace == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| else: | ||
| repo_type = None | ||
| else: | ||
| # Single segment | ||
| repo_id = url_segments[0] | ||
| namespace = None | ||
| repo_type = None | ||
| elif is_hf_id: | ||
| if len(url_segments) == 3: | ||
| # Passed <repo_type>/<user>/<model_id> or <repo_type>/<org>/<model_id> | ||
| repo_type, namespace, repo_id = url_segments[-3:] | ||
| elif len(url_segments) == 2: | ||
| if url_segments[0] in constants.REPO_TYPES_MAPPING: | ||
| # Passed '<model_id>' or 'datasets/<dataset_id>' for a canonical model or dataset | ||
| repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] | ||
| namespace = None | ||
| repo_id = hf_id.split("/")[-1] | ||
| elif url_segments[0] == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| repo_id = hf_id.split("/")[-1] | ||
| else: | ||
| # Passed <user>/<model_id> or <org>/<model_id> | ||
| namespace, repo_id = hf_id.split("/")[-2:] | ||
| repo_type = None | ||
| else: | ||
| # Passed <model_id> | ||
| repo_id = url_segments[0] | ||
| namespace, repo_type = None, None | ||
| else: | ||
| # For non-URL inputs, reject paths with more than 3 segments. | ||
| # URL inputs can have extra segments (e.g. /blob/main/file.txt) which are ignored. | ||
| if not is_hf_url and len(url_segments) > 3: | ||
| raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}") | ||
|
|
||
| if not is_hf_url and len(url_segments) == 3: | ||
| # Passed <repo_type>/<user>/<model_id> — accept singular type names | ||
| # (e.g. "dataset/user/id") which parse_hf_url doesn't handle. | ||
| repo_type, namespace, repo_id = url_segments | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. **Bucket paths with 3 segments fail validation** (High Severity): When |
||
| elif not is_hf_url and len(url_segments) == 2 and url_segments[0] == "buckets": | ||
| # Special case: "buckets/name" (no namespace) — parse_hf_url expects | ||
| # namespace/name for buckets, but this function accepts bare bucket names. | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| repo_id = url_segments[1] | ||
| else: | ||
| # Delegate to the central parser for type detection, bucket handling, etc. | ||
| parsed = parse_hf_url(hf_id) | ||
|
|
||
| if isinstance(parsed, ParsedBucketUrl): | ||
| repo_type: Optional[str] = "bucket" | ||
| namespace = parsed.namespace | ||
| repo_id = parsed.bucket_name | ||
| else: | ||
| # When no type prefix is present, parse_hf_url defaults to "model". | ||
| # This function returns None instead. | ||
| repo_type = parsed.repo_type if parsed.has_explicit_type else None | ||
| namespace = parsed.namespace | ||
| repo_id = parsed.repo_name | ||
|
|
||
| # Check if repo type is known (mapping "spaces" => "space" + empty value => `None`) | ||
| if repo_type in constants.REPO_TYPES_MAPPING: | ||
| repo_type = constants.REPO_TYPES_MAPPING[repo_type] | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
**`Volume` `@revision` silently dropped during parsing** (High Severity)

The `Volume` dataclass has a `revision` field, but the refactored `_parse_volumes` never populates it. The old code preserved `@revision` as part of the `source` string (e.g., `source="user/repo@main"`). Now `parse_hf_url` extracts the revision into `parsed.revision` and strips it from `repo_id`, but `parsed.revision` is never passed to the `Volume` constructor. For any volume spec using `@revision` syntax, the revision is silently lost.

Additional Locations (1)
src/huggingface_hub/cli/jobs.py#L1134-L1144