-
Notifications
You must be signed in to change notification settings - Fork 982
Centralize hf:// URI parsing into utils/_hf_uri.py
#3994
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
0d676aa
34591c8
0df167d
4d3b1a2
44364b6
397c88d
1aaaa7f
4c080e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -78,6 +78,7 @@ | |
| from huggingface_hub._jobs_api import Volume | ||
| from huggingface_hub.errors import CLIError, HfHubHTTPError | ||
| from huggingface_hub.utils import logging | ||
| from huggingface_hub.utils._hf_uri import ParsedBucketUrl, parse_hf_url | ||
| from huggingface_hub.utils._cache_manager import _format_size | ||
|
|
||
| from ._cli_utils import ( | ||
|
|
@@ -1099,12 +1100,6 @@ def _parse_volumes(volumes: Optional[list[str]]) -> Optional[list[Volume]]: | |
| return None | ||
|
|
||
| HF_PREFIX = "hf://" | ||
| HF_TYPES_MAPPING = { | ||
| "models": constants.REPO_TYPE_MODEL, | ||
| "datasets": constants.REPO_TYPE_DATASET, | ||
| "spaces": constants.REPO_TYPE_SPACE, | ||
| "buckets": "bucket", | ||
| } | ||
|
|
||
| result: list[Volume] = [] | ||
| for raw_spec in volumes: | ||
|
|
@@ -1124,45 +1119,29 @@ def _parse_volumes(volumes: Optional[list[str]]) -> Optional[list[Volume]]: | |
| f"Invalid volume format: '{raw_spec}'. Source must start with 'hf://'. " | ||
| f"Expected hf://[TYPE/]SOURCE:/MOUNT_PATH[:ro]. E.g. hf://gpt2:/data" | ||
| ) | ||
| spec = spec[len(HF_PREFIX) :] | ||
|
|
||
| # Find the mount path: look for :/ pattern | ||
| colon_slash_idx = spec.find(":/") | ||
| # We search in the part after "hf://" to avoid matching the "://" in the prefix. | ||
| after_prefix = spec[len(HF_PREFIX) :] | ||
| colon_slash_idx = after_prefix.find(":/") | ||
| if colon_slash_idx == -1: | ||
| raise CLIError( | ||
| f"Invalid volume format: '{raw_spec}'. Expected hf://[TYPE/]SOURCE:/MOUNT_PATH[:ro]. E.g. hf://gpt2:/data" | ||
| ) | ||
| source_part = spec[:colon_slash_idx] | ||
| mount_path = spec[colon_slash_idx + 1 :] | ||
|
|
||
| # Parse type from source_part (first segment before /) | ||
| # Then split remaining into source (namespace/name or name) and optional path. | ||
| slash_idx = source_part.find("/") | ||
| if slash_idx == -1: | ||
| # No slash: bare source like "gpt2" -> model type | ||
| vol_type_str = constants.REPO_TYPE_MODEL | ||
| source = source_part | ||
| path = None | ||
| source_uri = HF_PREFIX + after_prefix[:colon_slash_idx] | ||
| mount_path = after_prefix[colon_slash_idx + 1 :] | ||
|
|
||
| # Parse the source URI using the central parser. | ||
| parsed = parse_hf_url(source_uri) | ||
|
|
||
| if isinstance(parsed, ParsedBucketUrl): | ||
| vol_type_str = "bucket" | ||
| source = parsed.bucket_id | ||
| path = parsed.path or None | ||
| else: | ||
| first_segment = source_part[:slash_idx] | ||
| if first_segment in HF_TYPES_MAPPING: | ||
| vol_type_str = HF_TYPES_MAPPING[first_segment] | ||
| remaining = source_part[slash_idx + 1 :] | ||
| else: | ||
| # First segment isn't a known type -> model type | ||
| vol_type_str = constants.REPO_TYPE_MODEL | ||
| remaining = source_part | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Volume
|
||
|
|
||
| # Split remaining into source (namespace/name) and optional path. | ||
| # Repo/bucket IDs are "namespace/name" (2 segments) or "name" (1 segment). | ||
| # Any extra segments are the path inside the repo/bucket. | ||
| parts = remaining.split("/", 2) | ||
| if len(parts) >= 3: | ||
| source = parts[0] + "/" + parts[1] | ||
| path = parts[2] | ||
| else: | ||
| source = remaining | ||
| path = None | ||
| vol_type_str = parsed.repo_type | ||
| source = parsed.repo_id | ||
| path = parsed.path_in_repo or None | ||
|
|
||
| result.append( | ||
| Volume( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -301,6 +301,9 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu | |
| [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) | ||
| If `repo_type` is unknown. | ||
| """ | ||
|
|
||
| from .utils._hf_uri import ParsedBucketUrl, parse_hf_url | ||
|
|
||
| input_hf_id = hf_id | ||
|
|
||
| # Get the hub_url (with or without protocol) | ||
|
|
@@ -315,86 +318,45 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu | |
| if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists | ||
| hf_id = hf_id[len(HFFS_PREFIX) :] | ||
|
|
||
| # If it's a URL, strip the endpoint prefix to get the path | ||
| # If it's a URL, strip the endpoint prefix to get the relative path | ||
| if is_hf_url: | ||
| # Remove protocol if present | ||
| hf_id_normalized = _REGEX_HTTP_PROTOCOL.sub("", hf_id) | ||
|
|
||
| # Remove the hub_url prefix to get the relative path | ||
| if hf_id_normalized.startswith(hub_url_without_protocol): | ||
| # Strip the hub URL and any leading slashes | ||
| hf_id = hf_id_normalized[len(hub_url_without_protocol) :].lstrip("/") | ||
|
|
||
| # At this point hf_id is a relative path like "datasets/user/repo", "user/repo", or "repo". | ||
| url_segments = hf_id.split("/") | ||
| is_hf_id = len(url_segments) <= 3 | ||
|
|
||
| namespace: Optional[str] | ||
| if is_hf_url: | ||
| # For URLs, we need to extract repo_type, namespace, repo_id | ||
| # Expected format after stripping endpoint: [repo_type]/namespace/repo_id or namespace/repo_id | ||
|
|
||
| if len(url_segments) >= 3: | ||
| # Check if first segment is a repo type | ||
| if url_segments[0] in constants.REPO_TYPES_MAPPING: | ||
| repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] | ||
| namespace = url_segments[1] | ||
| repo_id = url_segments[2] | ||
| elif url_segments[0] == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = url_segments[1] | ||
| repo_id = url_segments[2] | ||
| else: | ||
| # First segment is namespace | ||
| namespace = url_segments[0] | ||
| repo_id = url_segments[1] | ||
| repo_type = None | ||
| elif len(url_segments) == 2: | ||
| namespace = url_segments[0] | ||
| repo_id = url_segments[1] | ||
|
|
||
| # Check if namespace is actually a repo type mapping | ||
| if namespace in constants.REPO_TYPES_MAPPING: | ||
| # Mean canonical dataset or model | ||
| repo_type = constants.REPO_TYPES_MAPPING[namespace] | ||
| namespace = None | ||
| elif namespace == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| else: | ||
| repo_type = None | ||
| else: | ||
| # Single segment | ||
| repo_id = url_segments[0] | ||
| namespace = None | ||
| repo_type = None | ||
| elif is_hf_id: | ||
| if len(url_segments) == 3: | ||
| # Passed <repo_type>/<user>/<model_id> or <repo_type>/<org>/<model_id> | ||
| repo_type, namespace, repo_id = url_segments[-3:] | ||
| elif len(url_segments) == 2: | ||
| if url_segments[0] in constants.REPO_TYPES_MAPPING: | ||
| # Passed '<model_id>' or 'datasets/<dataset_id>' for a canonical model or dataset | ||
| repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] | ||
| namespace = None | ||
| repo_id = hf_id.split("/")[-1] | ||
| elif url_segments[0] == "buckets": | ||
| # Special case for buckets | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| repo_id = hf_id.split("/")[-1] | ||
| else: | ||
| # Passed <user>/<model_id> or <org>/<model_id> | ||
| namespace, repo_id = hf_id.split("/")[-2:] | ||
| repo_type = None | ||
| else: | ||
| # Passed <model_id> | ||
| repo_id = url_segments[0] | ||
| namespace, repo_type = None, None | ||
| else: | ||
| # For non-URL inputs, reject paths with more than 3 segments. | ||
| # URL inputs can have extra segments (e.g. /blob/main/file.txt) which are ignored. | ||
| if not is_hf_url and len(url_segments) > 3: | ||
| raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}") | ||
|
|
||
| if not is_hf_url and len(url_segments) == 3: | ||
| # Passed <repo_type>/<user>/<model_id> — accept singular type names | ||
| # (e.g. "dataset/user/id") which parse_hf_url doesn't handle. | ||
| repo_type, namespace, repo_id = url_segments | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bucket paths with 3 segments fail validation — High Severity. When |
||
| elif not is_hf_url and len(url_segments) == 2 and url_segments[0] == "buckets": | ||
| # Special case: "buckets/name" (no namespace) — parse_hf_url expects | ||
| # namespace/name for buckets, but this function accepts bare bucket names. | ||
| repo_type = "bucket" | ||
| namespace = None | ||
| repo_id = url_segments[1] | ||
| else: | ||
| # Delegate to the central parser for type detection, bucket handling, etc. | ||
| parsed = parse_hf_url(hf_id) | ||
|
|
||
| if isinstance(parsed, ParsedBucketUrl): | ||
| repo_type: Optional[str] = "bucket" | ||
| namespace = parsed.namespace | ||
| repo_id = parsed.bucket_name | ||
| else: | ||
| # When no type prefix is present, parse_hf_url defaults to "model". | ||
| # This function returns None instead. | ||
| repo_type = parsed.repo_type if parsed.has_explicit_type else None | ||
| namespace = parsed.namespace | ||
| repo_id = parsed.repo_name | ||
|
|
||
| # Check if repo type is known (mapping "spaces" => "space" + empty value => `None`) | ||
| if repo_type in constants.REPO_TYPES_MAPPING: | ||
| repo_type = constants.REPO_TYPES_MAPPING[repo_type] | ||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.