From 1490f6cb8296fd25accd358f96cca8f865c2fec5 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 15:51:02 +0100 Subject: [PATCH 01/21] Add HfApi copy_files and remote hf buckets cp support --- docs/source/en/guides/buckets.md | 37 +++ docs/source/en/guides/cli.md | 19 +- docs/source/en/package_reference/cli.md | 10 +- pr_copy_files.md | 71 ++++++ src/huggingface_hub/cli/buckets.py | 33 ++- src/huggingface_hub/hf_api.py | 319 +++++++++++++++++++++++- tests/test_buckets.py | 78 ++++++ tests/test_buckets_cli.py | 58 ++++- 8 files changed, 597 insertions(+), 28 deletions(-) create mode 100644 pr_copy_files.md diff --git a/docs/source/en/guides/buckets.md b/docs/source/en/guides/buckets.md index 24c4870bfb..64f2f98a3b 100644 --- a/docs/source/en/guides/buckets.md +++ b/docs/source/en/guides/buckets.md @@ -456,6 +456,43 @@ You can also pipe the content of a file directly to `stdout` using `-`: >>> hf buckets cp hf://buckets/username/my-bucket/config.json - | jq . ``` +### Copy between remote HF handles + +Use [`copy_files`] to copy files between HF handles: + +```py +>>> from huggingface_hub import copy_files + +# Bucket to bucket (same bucket or different bucket) +>>> copy_files( +... "hf://buckets/username/source-bucket/checkpoints/model.safetensors", +... "hf://buckets/username/destination-bucket/archive/model.safetensors", +... ) + +# Repo to bucket +>>> copy_files( +... "hf://datasets/username/my-dataset/processed/", +... "hf://buckets/username/my-bucket/datasets/processed/", +... ) +``` + +The same is available from the CLI: + +```bash +# Bucket to bucket +>>> hf buckets cp hf://buckets/username/source-bucket/logs/ hf://buckets/username/destination-bucket/logs/ + +# Repo to bucket +>>> hf buckets cp hf://username/my-model/config.json hf://buckets/username/my-bucket/models/config.json +``` + +Notes: + +- Folder copy requires destination to end with `/`. +- Bucket-to-repo copy is not supported. 
+- For repo sources, files with an available `xet_hash` are copied directly. Regular files without `xet_hash` are downloaded + and re-uploaded. + ### Download a directory with the CLI Use `hf buckets sync` to download all files from a bucket to a local directory: diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index a8301779d0..974157e585 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -657,7 +657,7 @@ To filter by prefix, append the prefix to the bucket path: ### Copy single files -Use `hf buckets cp` to copy individual files to and from a bucket. Bucket paths use the `hf://buckets/` prefix. +Use `hf buckets cp` to copy files between local paths and HF handles, or between remote HF handles. To upload a file: @@ -687,6 +687,21 @@ You can also stream to stdout or from stdin using `-`: >>> echo "hello" | hf buckets cp - hf://buckets/username/my-bucket/hello.txt ``` +To copy between remote handles: + +```bash +# Bucket to bucket +>>> hf buckets cp hf://buckets/username/source-bucket/logs/ hf://buckets/username/archive-bucket/logs/ + +# Repo to bucket +>>> hf buckets cp hf://datasets/username/my-dataset/data/train/ hf://buckets/username/my-bucket/datasets/train/ +``` + +Notes: + +- Folder copy requires destination to end with `/`. +- Bucket-to-repo copy is not supported. + ### Sync directories Use `hf buckets sync` to synchronize directories between your local machine and a bucket. It compares source and destination and transfers only changed files. 
@@ -1527,4 +1542,4 @@ Use `hf endpoints catalog` to interact with the Inference Endpoints Model Catalo # Deploy with a custom name >>> hf endpoints catalog deploy --repo meta-llama/Llama-3.2-1B-Instruct --name my-llama-endpoint -``` \ No newline at end of file +``` diff --git a/docs/source/en/package_reference/cli.md b/docs/source/en/package_reference/cli.md index 7f9c5a7c6c..c818abfa5d 100644 --- a/docs/source/en/package_reference/cli.md +++ b/docs/source/en/package_reference/cli.md @@ -202,7 +202,7 @@ $ hf buckets [OPTIONS] COMMAND [ARGS]... **Commands**: -* `cp`: Copy a single file to or from a bucket. +* `cp`: Copy files to or from buckets. * `create`: Create a new bucket. * `delete`: Delete a bucket. * `info`: Get info about a bucket. @@ -213,7 +213,7 @@ $ hf buckets [OPTIONS] COMMAND [ARGS]... ### `hf buckets cp` -Copy a single file to or from a bucket. +Copy files to or from buckets. **Usage**: @@ -223,8 +223,8 @@ $ hf buckets cp [OPTIONS] SRC [DST] **Arguments**: -* `SRC`: Source: local file, hf://buckets/... path, or - for stdin [required] -* `[DST]`: Destination: local path, hf://buckets/... path, or - for stdout +* `SRC`: Source: local file, HF handle (hf://...), or - for stdin [required] +* `[DST]`: Destination: local path, HF handle (hf://...), or - for stdout **Options**: @@ -241,6 +241,8 @@ Examples $ hf buckets cp my-config.json hf://buckets/user/my-bucket/logs/ $ hf buckets cp my-config.json hf://buckets/user/my-bucket/remote-config.json $ hf buckets cp - hf://buckets/user/my-bucket/config.json + $ hf buckets cp hf://buckets/user/my-bucket/logs/ hf://buckets/user/archive-bucket/logs/ + $ hf buckets cp hf://datasets/user/my-dataset/processed/ hf://buckets/user/my-bucket/dataset/processed/ Learn more Use `hf --help` for more information about a command. 
diff --git a/pr_copy_files.md b/pr_copy_files.md new file mode 100644 index 0000000000..c1abcf370c --- /dev/null +++ b/pr_copy_files.md @@ -0,0 +1,71 @@ +## Summary + +Add a new `HfApi.copy_files` API and wire `hf buckets cp` to support remote HF-handle copy flows. + +### Implemented + +- Added `HfApi.copy_files(source, destination, *, token=None) -> int`. +- Supported copy directions: + - bucket -> same bucket + - bucket -> different bucket + - repo (model/dataset/space) -> bucket +- Explicitly reject bucket/repo destinations that are not buckets (bucket -> repo and repo -> repo). +- Added folder-copy rule: destination must end with `/` for folder sources. +- Added repo `@revision` support in source handles (including special refs parsing). +- Added top-level alias export: `copy_files = api.copy_files`. + +## Implementation details + +- Source/destination are parsed as HF handles (`hf://...`) with dedicated copy-handle parsing. +- For repo sources: + - if `xet_hash` exists: direct bucket add by hash + - if `xet_hash is None`: download file then re-upload to bucket +- For bucket sources: + - same-bucket copy: direct add by hash + - cross-bucket copy: download + re-upload fallback (required by backend behavior) +- Extended internal `_batch_bucket_files` to accept prebuilt `_BucketAddFile` entries and skip Xet upload when `xet_hash` is already known. + +## CLI changes + +- `hf buckets cp` now supports remote-to-remote handle copies by delegating to `api.copy_files`. 
+- Added new examples for: + - bucket -> bucket + - repo -> bucket +- Output now reports count for remote copies: + - `Copied <n> file(s): SRC -> DST` + +## Tests + +### Added in `tests/test_buckets.py` + +- `test_copy_files_bucket_to_same_bucket_file` +- `test_copy_files_bucket_to_different_bucket_folder` +- `test_copy_files_repo_to_bucket_with_revision` +- `test_copy_files_bucket_to_repo_raises` +- `test_copy_files_folder_requires_destination_suffix` + +### Added in `tests/test_buckets_cli.py` + +- `test_cp_remote_bucket_to_bucket` +- `test_cp_remote_repo_to_bucket` +- `test_cp_error_bucket_to_repo` +- `test_cp_error_remote_folder_requires_destination_suffix` + +## Documentation + +Updated: + +- `docs/source/en/guides/buckets.md` +- `docs/source/en/guides/cli.md` +- `docs/source/en/package_reference/cli.md` (generated) + +## Validation + +Executed: + +- `make style` +- `make quality` +- `pytest tests/test_buckets.py -k copy_files -q` +- `pytest tests/test_buckets_cli.py -k "cp_remote_bucket_to_bucket or cp_remote_repo_to_bucket or cp_error_bucket_to_repo or cp_error_remote_folder_requires_destination_suffix" -q` + +All passed. diff --git a/src/huggingface_hub/cli/buckets.py b/src/huggingface_hub/cli/buckets.py index e058d1a28f..1654f72330 100644 --- a/src/huggingface_hub/cli/buckets.py +++ b/src/huggingface_hub/cli/buckets.py @@ -53,6 +53,10 @@ buckets_cli = typer_factory(help="Commands to interact with buckets.") +def _is_hf_handle(path: str) -> bool: + return path.startswith("hf://") + + def _parse_bucket_argument(argument: str) -> tuple[str, str]: """Parse a bucket argument accepting both 'namespace/name(/prefix)' and 'hf://buckets/namespace/name(/prefix)'. 
@@ -924,28 +928,45 @@ def sync( "hf buckets cp my-config.json hf://buckets/user/my-bucket/logs/", "hf buckets cp my-config.json hf://buckets/user/my-bucket/remote-config.json", "hf buckets cp - hf://buckets/user/my-bucket/config.json", + "hf buckets cp hf://buckets/user/my-bucket/logs/ hf://buckets/user/archive-bucket/logs/", + "hf buckets cp hf://datasets/user/my-dataset/processed/ hf://buckets/user/my-bucket/dataset/processed/", ], ) def cp( - src: Annotated[str, typer.Argument(help="Source: local file, hf://buckets/... path, or - for stdin")], + src: Annotated[str, typer.Argument(help="Source: local file, HF handle (hf://...), or - for stdin")], dst: Annotated[ - Optional[str], typer.Argument(help="Destination: local path, hf://buckets/... path, or - for stdout") + Optional[str], typer.Argument(help="Destination: local path, HF handle (hf://...), or - for stdout") ] = None, quiet: QuietOpt = False, token: TokenOpt = None, ) -> None: - """Copy a single file to or from a bucket.""" + """Copy files to or from buckets.""" api = get_hf_api(token=token) + src_is_hf = _is_hf_handle(src) + dst_is_hf = dst is not None and _is_hf_handle(dst) src_is_bucket = _is_bucket_path(src) dst_is_bucket = dst is not None and _is_bucket_path(dst) src_is_stdin = src == "-" dst_is_stdout = dst == "-" - # --- Validation --- - if src_is_bucket and dst_is_bucket: - raise typer.BadParameter("Remote-to-remote copy not supported.") + if src_is_hf and dst_is_hf: + assert dst is not None + if quiet: + disable_progress_bars() + try: + copied_count = api.copy_files(src, dst) + except ValueError as error: + raise typer.BadParameter(str(error)) + finally: + if quiet: + enable_progress_bars() + if not quiet: + print(f"Copied {copied_count} file(s): {src} -> {dst}") + return + + # --- Validation --- if not src_is_bucket and not dst_is_bucket and not src_is_stdin: if dst is None: raise typer.BadParameter("Missing destination. 
Provide a bucket path as DST.") diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 7097592456..61740c441d 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -20,6 +20,7 @@ import json import re import struct +import tempfile import time import warnings from collections import defaultdict @@ -38,12 +39,13 @@ Iterator, Literal, Optional, + Sequence, Type, TypeVar, Union, overload, ) -from urllib.parse import quote +from urllib.parse import quote, unquote import httpcore import httpx @@ -248,6 +250,15 @@ _BUCKET_PATHS_INFO_BATCH_SIZE = 1000 _BUCKET_BATCH_ADD_CHUNK_SIZE = 100 _BUCKET_BATCH_DELETE_CHUNK_SIZE = 1000 +_HF_COPY_REPO_TYPE_PREFIXES: dict[str, Literal["model", "dataset", "space"]] = { + "model": constants.REPO_TYPE_MODEL, + "models": constants.REPO_TYPE_MODEL, + "dataset": constants.REPO_TYPE_DATASET, + "datasets": constants.REPO_TYPE_DATASET, + "space": constants.REPO_TYPE_SPACE, + "spaces": constants.REPO_TYPE_SPACE, +} +_SPECIAL_REFS_REVISION_REGEX = re.compile(r"(^refs\/convert\/\w+)|(^refs\/pr\/\d+)") logger = logging.get_logger(__name__) @@ -386,6 +397,68 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu return repo_type, namespace, repo_id +def _parse_hf_copy_handle(hf_handle: str) -> Union[_BucketCopyHandle, _RepoCopyHandle]: + if not hf_handle.startswith("hf://"): + raise ValueError(f"Invalid HF handle: '{hf_handle}'. 
Expected a path starting with 'hf://'.") + + path = hf_handle.removeprefix("hf://") + if path.startswith("buckets/"): + bucket_id, bucket_path = _split_bucket_id_and_prefix(path.removeprefix("buckets/")) + return _BucketCopyHandle( + bucket_id=bucket_id, + path=bucket_path.strip("/"), + is_directory=hf_handle.endswith("/") and bucket_path != "", + ) + + path = path.strip("/") + if path == "": + raise ValueError(f"Invalid HF handle: '{hf_handle}'.") + + parts = path.split("/") + repo_type: Literal["model", "dataset", "space"] = constants.REPO_TYPE_MODEL + if parts[0] in _HF_COPY_REPO_TYPE_PREFIXES: + repo_type = _HF_COPY_REPO_TYPE_PREFIXES[parts[0]] + parts = parts[1:] + + if len(parts) < 2: + raise ValueError( + f"Invalid repo HF handle: '{hf_handle}'. Expected format 'hf:////path' " + "or with explicit repo type prefix." + ) + + namespace, repo_name_with_revision = parts[0], parts[1] + remaining_parts = parts[2:] + revision: Optional[str] = None + if "@" in repo_name_with_revision: + repo_name, revision = repo_name_with_revision.split("@", 1) + else: + repo_name = repo_name_with_revision + + if revision is None: + revision = constants.DEFAULT_REVISION + elif remaining_parts: + maybe_special_ref = f"{unquote(revision)}/{remaining_parts[0]}" + match = _SPECIAL_REFS_REVISION_REGEX.match(maybe_special_ref) + if match is not None: + special_ref = match.group() + revision = special_ref + suffix = maybe_special_ref.removeprefix(special_ref).lstrip("/") + remaining_parts = ([suffix] if suffix else []) + remaining_parts[1:] + else: + revision = unquote(revision) + else: + revision = unquote(revision) + + repo_path = "/".join(remaining_parts).strip("/") + return _RepoCopyHandle( + repo_type=repo_type, + repo_id=f"{namespace}/{repo_name}", + revision=revision, + path=repo_path, + is_directory=hf_handle.endswith("/") and repo_path != "", + ) + + @dataclass class LastCommitInfo(dict): oid: str @@ -648,6 +721,22 @@ def __repr__(self) -> str: return f"RepoUrl('{self}', 
endpoint='{self.endpoint}', repo_type='{self.repo_type}', repo_id='{self.repo_id}')" +@dataclass(frozen=True) +class _BucketCopyHandle: + bucket_id: str + path: str + is_directory: bool + + +@dataclass(frozen=True) +class _RepoCopyHandle: + repo_type: Literal["model", "dataset", "space"] + repo_id: str + revision: str + path: str + is_directory: bool + + @dataclass class RepoSibling: """ @@ -11721,6 +11810,197 @@ def get_bucket_paths_info( for path_info in response.json(): yield BucketFile(**path_info) + @validate_hf_hub_args + def copy_files( + self, + source: str, + destination: str, + *, + token: Union[str, bool, None] = None, + ) -> int: + """Copy files between HF handles. + + Supported: + - bucket -> bucket + - repo (model/dataset/space) -> bucket + + Unsupported: + - bucket -> repo + - local paths + + If source is a folder, destination must end with ``/``. + """ + source_handle = _parse_hf_copy_handle(source) + destination_handle = _parse_hf_copy_handle(destination) + + if isinstance(destination_handle, _RepoCopyHandle): + raise ValueError("Bucket-to-repo and repo-to-repo copy are not supported. 
Destination must be a bucket.") + + if isinstance(source_handle, _BucketCopyHandle) and isinstance(destination_handle, _RepoCopyHandle): + raise ValueError("Bucket-to-repo copy is not supported.") + + destination_bucket_id = destination_handle.bucket_id + destination_path = destination_handle.path + destination_is_directory = destination_handle.is_directory or destination_path == "" + + copy_count = 0 + hash_based_adds: list[_BucketAddFile] = [] + + def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_single_file: bool) -> str: + basename = src_file_path.rsplit("/", 1)[-1] + if is_single_file: + if destination_path == "": + return basename + if destination_is_directory: + return f"{destination_path.rstrip('/')}/{basename}" + return destination_path + + if not destination_is_directory: + raise ValueError("Folder copy requires destination to end with '/'.") + if src_root_path is None: + rel_path = src_file_path + elif src_file_path.startswith(src_root_path + "/"): + rel_path = src_file_path[len(src_root_path) + 1 :] + elif src_file_path == src_root_path: + rel_path = src_file_path.rsplit("/", 1)[-1] + else: + raise ValueError(f"Unexpected source path while copying folder: '{src_file_path}'.") + + if rel_path == "": + raise ValueError("Cannot copy an empty relative path.") + if destination_path == "": + return rel_path + return f"{destination_path.rstrip('/')}/{rel_path}" + + def _flush_hash_based_adds() -> None: + nonlocal hash_based_adds + if not hash_based_adds: + return + for add_chunk in chunk_iterable(hash_based_adds, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): + self._batch_bucket_files(destination_bucket_id, add=list(add_chunk), token=token) + hash_based_adds = [] + + if isinstance(source_handle, _BucketCopyHandle): + source_path = source_handle.path + same_bucket_copy = source_handle.bucket_id == destination_bucket_id + source_path_info = list(self.get_bucket_paths_info(source_handle.bucket_id, [source_path], token=token)) + if 
source_path_info: + source_file = source_path_info[0] + target_path = _resolve_target_path(source_file.path, None, is_single_file=True) + if same_bucket_copy: + hash_based_adds.append( + _BucketAddFile( + source=b"", + destination=target_path, + xet_hash=source_file.xet_hash, + size=source_file.size, + ) + ) + else: + with tempfile.TemporaryDirectory() as tmp_dir: + local_path = str(Path(tmp_dir) / source_file.path.rsplit("/", 1)[-1]) + self.download_bucket_files( + source_handle.bucket_id, [(source_file.path, local_path)], token=token + ) + self.batch_bucket_files(destination_bucket_id, add=[(local_path, target_path)], token=token) + copy_count += 1 + else: + if source_path != "" and not destination_is_directory: + raise ValueError("Folder copy requires destination to end with '/'.") + for item in self.list_bucket_tree( + source_handle.bucket_id, prefix=source_path or None, recursive=True, token=token + ): + if not isinstance(item, BucketFile): + continue + if source_path and not (item.path == source_path or item.path.startswith(source_path + "/")): + continue + target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) + if same_bucket_copy: + hash_based_adds.append( + _BucketAddFile(source=b"", destination=target_path, xet_hash=item.xet_hash, size=item.size) + ) + else: + _flush_hash_based_adds() + with tempfile.TemporaryDirectory() as tmp_dir: + local_path = str(Path(tmp_dir) / item.path.rsplit("/", 1)[-1]) + self.download_bucket_files(source_handle.bucket_id, [(item.path, local_path)], token=token) + self.batch_bucket_files( + destination_bucket_id, add=[(local_path, target_path)], token=token + ) + copy_count += 1 + else: + source_path = source_handle.path + source_path_info: list[Union[RepoFile, RepoFolder]] = [] + if source_path != "": + source_path_info = self.get_paths_info( + repo_id=source_handle.repo_id, + paths=[source_path], + repo_type=source_handle.repo_type, + revision=source_handle.revision, + token=token, + ) + + if 
len(source_path_info) == 1 and isinstance(source_path_info[0], RepoFile): + source_file = source_path_info[0] + target_path = _resolve_target_path(source_file.path, None, is_single_file=True) + if source_file.xet_hash is not None: + hash_based_adds.append( + _BucketAddFile( + source=b"", + destination=target_path, + xet_hash=source_file.xet_hash, + size=source_file.size, + ) + ) + else: + with tempfile.TemporaryDirectory() as tmp_dir: + local_path = self.hf_hub_download( + repo_id=source_handle.repo_id, + repo_type=source_handle.repo_type, + filename=source_file.path, + revision=source_handle.revision, + local_dir=tmp_dir, + token=token, + ) + self.batch_bucket_files(destination_bucket_id, add=[(local_path, target_path)], token=token) + copy_count += 1 + else: + if source_path and not destination_is_directory: + raise ValueError("Folder copy requires destination to end with '/'.") + for item in self.list_repo_tree( + repo_id=source_handle.repo_id, + path_in_repo=source_path or None, + recursive=True, + repo_type=source_handle.repo_type, + revision=source_handle.revision, + token=token, + ): + if not isinstance(item, RepoFile): + continue + target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) + if item.xet_hash is not None: + hash_based_adds.append( + _BucketAddFile(source=b"", destination=target_path, xet_hash=item.xet_hash, size=item.size) + ) + else: + _flush_hash_based_adds() + with tempfile.TemporaryDirectory() as tmp_dir: + local_path = self.hf_hub_download( + repo_id=source_handle.repo_id, + repo_type=source_handle.repo_type, + filename=item.path, + revision=source_handle.revision, + local_dir=tmp_dir, + token=token, + ) + self.batch_bucket_files( + destination_bucket_id, add=[(local_path, target_path)], token=token + ) + copy_count += 1 + + _flush_hash_based_adds() + return copy_count + @validate_hf_hub_args def batch_bucket_files( self, @@ -11772,12 +12052,14 @@ def batch_bucket_files( ... 
) ``` """ - add = add or [] - delete = delete or [] + add = [] if add is None else add + delete = [] if delete is None else delete # Small batch: do everything in one call if len(add) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: - self._batch_bucket_files(bucket_id, add=add or None, delete=delete or None, token=token) + add_payload: Optional[list[tuple[Union[str, Path, bytes], str]]] = add if len(add) > 0 else None + delete_payload: Optional[list[str]] = delete if len(delete) > 0 else None + self._batch_bucket_files(bucket_id, add=add_payload, delete=delete_payload, token=token) return # Large batch: chunk adds first, then deletes @@ -11790,7 +12072,8 @@ def batch_bucket_files( try: for add_chunk in chunk_iterable(add, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): - self._batch_bucket_files(bucket_id, add=list(add_chunk), token=token, _progress=progress) + add_chunk_list: list[tuple[Union[str, Path, bytes], str]] = list(add_chunk) + self._batch_bucket_files(bucket_id, add=add_chunk_list, token=token, _progress=progress) for delete_chunk in chunk_iterable(delete, chunk_size=_BUCKET_BATCH_DELETE_CHUNK_SIZE): self._batch_bucket_files(bucket_id, delete=list(delete_chunk), token=token) @@ -11804,7 +12087,7 @@ def _batch_bucket_files( self, bucket_id: str, *, - add: Optional[list[tuple[Union[str, Path, bytes], str]]] = None, + add: Optional[Sequence[Union[tuple[Union[str, Path, bytes], str], _BucketAddFile]]] = None, delete: Optional[list[str]] = None, token: Union[str, bool, None] = None, _progress: Optional["XetProgressReporter"] = None, @@ -11813,8 +12096,12 @@ def _batch_bucket_files( # Convert public API inputs to internal operation objects operations: list[Union[_BucketAddFile, _BucketDeleteFile]] = [] if add: - for source, destination in add: - operations.append(_BucketAddFile(source=source, destination=destination)) + for item in add: + if isinstance(item, _BucketAddFile): + operations.append(item) + else: + source, destination = item + 
operations.append(_BucketAddFile(source=source, destination=destination)) if delete: for path in delete: operations.append(_BucketDeleteFile(path=path)) @@ -11829,10 +12116,11 @@ def _batch_bucket_files( headers = self._build_hf_headers(token=token) add_operations = [op for op in operations if isinstance(op, _BucketAddFile)] + add_operations_to_upload = [op for op in add_operations if op.xet_hash is None] add_bytes_operations = [op for op in add_operations if isinstance(op.source, bytes)] add_path_operations = [op for op in add_operations if not isinstance(op.source, bytes)] - if len(add_operations) > 0: + if len(add_operations_to_upload) > 0: try: xet_connection_info = fetch_xet_connection_info_from_repo_info( token_type=XetTokenType.WRITE, @@ -11877,14 +12165,16 @@ def token_refresher() -> tuple[str, int]: try: # 2.a. Upload path files xet_upload_infos = upload_files( - [str(op.source) for op in add_path_operations], + [str(op.source) for op in add_path_operations if op.xet_hash is None], xet_endpoint, access_token_info, token_refresher, progress_callback, "bucket", ) - for upload_info, op in zip(xet_upload_infos, add_path_operations): + for upload_info, op in zip( + xet_upload_infos, [op for op in add_path_operations if op.xet_hash is None] + ): op.xet_hash = upload_info.hash op.size = upload_info.filesize @@ -11893,14 +12183,16 @@ def token_refresher() -> tuple[str, int]: # 2.b. 
Upload bytes files xet_upload_infos = upload_bytes( - [op.source for op in add_bytes_operations], + [op.source for op in add_bytes_operations if op.xet_hash is None], xet_endpoint, access_token_info, token_refresher, progress_callback, "bucket", ) - for upload_info, op in zip(xet_upload_infos, add_bytes_operations): + for upload_info, op in zip( + xet_upload_infos, [op for op in add_bytes_operations if op.xet_hash is None] + ): op.xet_hash = upload_info.hash op.size = upload_info.filesize @@ -12598,6 +12890,7 @@ def get_local_safetensors_metadata(path: Union[str, Path]) -> SafetensorsRepoMet move_bucket = api.move_bucket list_bucket_tree = api.list_bucket_tree get_bucket_paths_info = api.get_bucket_paths_info +copy_files = api.copy_files batch_bucket_files = api.batch_bucket_files get_bucket_file_metadata = api.get_bucket_file_metadata download_bucket_files = api.download_bucket_files diff --git a/tests/test_buckets.py b/tests/test_buckets.py index 7bd80fa973..02bd948fbf 100644 --- a/tests/test_buckets.py +++ b/tests/test_buckets.py @@ -302,6 +302,84 @@ def test_download_bucket_files_raises_on_missing_when_requested(api: HfApi, buck assert "non_existent_file.txt" in str(exc_info.value) +@requires("hf_xet") +def test_copy_files_bucket_to_same_bucket_file(api: HfApi, bucket_write: str): + api.batch_bucket_files(bucket_write, add=[(b"bucket-content", "source.txt")]) + + copied_count = api.copy_files( + f"hf://buckets/{bucket_write}/source.txt", + f"hf://buckets/{bucket_write}/copied.txt", + ) + + assert copied_count == 1 + files = {entry.path for entry in api.list_bucket_tree(bucket_write)} + assert files >= {"source.txt", "copied.txt"} + + +@requires("hf_xet") +def test_copy_files_bucket_to_different_bucket_folder(api: HfApi, bucket_write: str): + source_bucket = bucket_write + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(source_bucket, add=[(b"a", "logs/a.txt"), (b"b", "logs/sub/b.txt"), (b"c", "other/c.txt")]) + + 
copied_count = api.copy_files( + f"hf://buckets/{source_bucket}/logs", + f"hf://buckets/{destination_bucket}/backup/", + ) + + assert copied_count == 2 + destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} + assert destination_files >= {"backup/a.txt", "backup/sub/b.txt"} + assert "backup/c.txt" not in destination_files + + +@requires("hf_xet") +def test_copy_files_repo_to_bucket_with_revision(api: HfApi, bucket_write: str, tmp_path): + repo_id = api.create_repo(repo_id=repo_name(prefix="copy-files")).repo_id + branch = "copy-files-branch" + try: + api.upload_file(repo_id=repo_id, path_in_repo="main.txt", path_or_fileobj=b"main") + api.create_branch(repo_id=repo_id, branch=branch) + api.upload_file( + repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch + ) + + copied_count = api.copy_files( + f"hf://{repo_id}@{branch}/nested/from-branch.txt", + f"hf://buckets/{bucket_write}/from-repo.txt", + ) + + assert copied_count == 1 + output_path = tmp_path / "from-repo.txt" + api.download_bucket_files(bucket_write, [("from-repo.txt", output_path)]) + assert output_path.read_bytes() == b"branch" + finally: + api.delete_repo(repo_id=repo_id) + + +@requires("hf_xet") +def test_copy_files_bucket_to_repo_raises(api: HfApi, bucket_write: str): + repo_id = api.create_repo(repo_id=repo_name(prefix="copy-files-dst")).repo_id + try: + api.batch_bucket_files(bucket_write, add=[(b"x", "x.txt")]) + with pytest.raises(ValueError, match="Destination must be a bucket"): + api.copy_files(f"hf://buckets/{bucket_write}/x.txt", f"hf://{repo_id}/x.txt") + finally: + api.delete_repo(repo_id=repo_id) + + +@requires("hf_xet") +def test_copy_files_folder_requires_destination_suffix(api: HfApi, bucket_write: str): + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(bucket_write, add=[(b"x", "folder/x.txt")]) + + with pytest.raises(ValueError, match="destination to end with '/'"): + 
api.copy_files( + f"hf://buckets/{bucket_write}/folder", + f"hf://buckets/{destination_bucket}/target-folder", + ) + + @pytest.mark.parametrize( "source, destination, expected_content_type", [ diff --git a/tests/test_buckets_cli.py b/tests/test_buckets_cli.py index e5f7ba83b1..bbec77a37f 100644 --- a/tests/test_buckets_cli.py +++ b/tests/test_buckets_cli.py @@ -876,10 +876,62 @@ def test_cp_download_creates_parent_dirs(bucket_with_files: str, tmp_path: Path) # -- Validation error tests -- -def test_cp_error_remote_to_remote(): - """Both src and dst are bucket paths.""" - result = cli("hf buckets cp hf://buckets/user/a/file.txt hf://buckets/user/b/file.txt") +def test_cp_remote_bucket_to_bucket(api: HfApi): + source_bucket = api.create_bucket(bucket_name()).bucket_id + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(source_bucket, add=[(b"aaa", "logs/a.txt"), (b"bbb", "logs/sub/b.txt"), (b"ccc", "c.txt")]) + + result = cli( + f"hf buckets cp hf://buckets/{source_bucket}/logs hf://buckets/{destination_bucket}/backup/", + ) + + assert result.exit_code == 0, result.output + assert "Copied 2 file(s)" in result.output + files = _remote_files(api, destination_bucket) + assert files >= {"backup/a.txt", "backup/sub/b.txt"} + assert "backup/c.txt" not in files + + +def test_cp_remote_repo_to_bucket(api: HfApi): + repo_id = api.create_repo(repo_id=repo_name(prefix="cp-copy")).repo_id + branch = "cp-copy-branch" + destination_bucket = api.create_bucket(bucket_name()).bucket_id + + try: + api.upload_file(repo_id=repo_id, path_in_repo="main.txt", path_or_fileobj=b"main") + api.create_branch(repo_id=repo_id, branch=branch) + api.upload_file( + repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch + ) + + result = cli( + f"hf buckets cp hf://{repo_id}@{branch}/nested/from-branch.txt hf://buckets/{destination_bucket}/copied.txt" + ) + assert result.exit_code == 0, result.output + assert "Copied 1 
file(s)" in result.output + finally: + api.delete_repo(repo_id=repo_id) + + +def test_cp_error_bucket_to_repo(api: HfApi, bucket_write: str): + repo_id = api.create_repo(repo_id=repo_name(prefix="cp-copy-dst")).repo_id + try: + api.batch_bucket_files(bucket_write, add=[(b"data", "file.txt")]) + result = cli(f"hf buckets cp hf://buckets/{bucket_write}/file.txt hf://{repo_id}/file.txt") + assert result.exit_code != 0 + assert "destination must be a bucket" in result.output.lower() + finally: + api.delete_repo(repo_id=repo_id) + + +def test_cp_error_remote_folder_requires_destination_suffix(api: HfApi): + source_bucket = api.create_bucket(bucket_name()).bucket_id + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(source_bucket, add=[(b"aaa", "folder/a.txt")]) + + result = cli(f"hf buckets cp hf://buckets/{source_bucket}/folder hf://buckets/{destination_bucket}/target") assert result.exit_code != 0 + assert "destination to end with '/'" in result.output def test_cp_error_both_local(tmp_path: Path): From bca3c0a9d0b60a95ea06ec98cc654b99861e7c39 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 16:07:55 +0100 Subject: [PATCH 02/21] Adjust copy_files return type and cp output message --- pr.md | 88 ++++++++++++++++++++++++++++++ src/huggingface_hub/cli/buckets.py | 4 +- src/huggingface_hub/hf_api.py | 9 +-- tests/test_buckets.py | 9 +-- tests/test_buckets_cli.py | 6 +- 5 files changed, 99 insertions(+), 17 deletions(-) create mode 100644 pr.md diff --git a/pr.md b/pr.md new file mode 100644 index 0000000000..7e32f91f85 --- /dev/null +++ b/pr.md @@ -0,0 +1,88 @@ +## Summary + +This PR adds a new `HfApi.copy_files` API and extends `hf buckets cp` to support remote HF-handle copy workflows. 
+ +### New capability + +- Copy from bucket to bucket (same bucket or different bucket) +- Copy from repo (model/dataset/space) to bucket +- Reject bucket->repo and repo->repo destinations (not supported yet) + +## API changes + +### Added + +- `HfApi.copy_files(source, destination, *, token=None) -> None` +- top-level alias export: `copy_files` + +### Handle support + +- Source and destination accept HF handles (`hf://...`) +- Repo source handles support explicit repo type prefixes and optional `@revision` + +Examples: + +- `hf://buckets///path/to/file` +- `hf://datasets///path/to/folder/` +- `hf:////path/to/file` +- `hf:///@/path/to/file` + +## Copy behavior + +- File source: copy one file +- Folder source: recursively copy files under the source folder +- Folder copy requires destination to end with `/` + +### Content transfer strategy + +- Repo source file with `xet_hash`: copied directly by hash +- Repo source file without `xet_hash` (regular small file): download then re-upload +- Bucket source to same bucket: copied by hash +- Bucket source to different bucket: download then re-upload fallback + +## Internal update + +Extended `_batch_bucket_files` internals to accept prebuilt `_BucketAddFile` entries, allowing direct hash-based add operations when hash metadata is already known. 
+ +## CLI changes + +Updated `hf buckets cp`: + +- now supports remote->remote HF handle copies via `api.copy_files` +- preserves existing local<->bucket and stdin/stdout behavior +- remote copy output (non-quiet): + - `Copied: SRC -> DST` + +## Tests added + +### `tests/test_buckets.py` + +- `test_copy_files_bucket_to_same_bucket_file` +- `test_copy_files_bucket_to_different_bucket_folder` +- `test_copy_files_repo_to_bucket_with_revision` +- `test_copy_files_bucket_to_repo_raises` +- `test_copy_files_folder_requires_destination_suffix` + +### `tests/test_buckets_cli.py` + +- `test_cp_remote_bucket_to_bucket` +- `test_cp_remote_repo_to_bucket` +- `test_cp_error_bucket_to_repo` +- `test_cp_error_remote_folder_requires_destination_suffix` + +## Documentation + +Updated: + +- `docs/source/en/guides/buckets.md` +- `docs/source/en/guides/cli.md` +- `docs/source/en/package_reference/cli.md` (generated) + +## Validation + +Executed successfully: + +- `make style` +- `make quality` +- `pytest tests/test_buckets.py -k copy_files -q` +- `pytest tests/test_buckets_cli.py -k "cp_remote_bucket_to_bucket or cp_remote_repo_to_bucket or cp_error_bucket_to_repo or cp_error_remote_folder_requires_destination_suffix" -q` diff --git a/src/huggingface_hub/cli/buckets.py b/src/huggingface_hub/cli/buckets.py index 1654f72330..0c7cbd511d 100644 --- a/src/huggingface_hub/cli/buckets.py +++ b/src/huggingface_hub/cli/buckets.py @@ -955,7 +955,7 @@ def cp( if quiet: disable_progress_bars() try: - copied_count = api.copy_files(src, dst) + api.copy_files(src, dst) except ValueError as error: raise typer.BadParameter(str(error)) finally: @@ -963,7 +963,7 @@ def cp( enable_progress_bars() if not quiet: - print(f"Copied {copied_count} file(s): {src} -> {dst}") + print(f"Copied: {src} -> {dst}") return # --- Validation --- diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 61740c441d..784a5ad743 100644 --- a/src/huggingface_hub/hf_api.py +++ 
b/src/huggingface_hub/hf_api.py @@ -11817,7 +11817,7 @@ def copy_files( destination: str, *, token: Union[str, bool, None] = None, - ) -> int: + ) -> None: """Copy files between HF handles. Supported: @@ -11843,7 +11843,6 @@ def copy_files( destination_path = destination_handle.path destination_is_directory = destination_handle.is_directory or destination_path == "" - copy_count = 0 hash_based_adds: list[_BucketAddFile] = [] def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_single_file: bool) -> str: @@ -11903,7 +11902,6 @@ def _flush_hash_based_adds() -> None: source_handle.bucket_id, [(source_file.path, local_path)], token=token ) self.batch_bucket_files(destination_bucket_id, add=[(local_path, target_path)], token=token) - copy_count += 1 else: if source_path != "" and not destination_is_directory: raise ValueError("Folder copy requires destination to end with '/'.") @@ -11927,7 +11925,6 @@ def _flush_hash_based_adds() -> None: self.batch_bucket_files( destination_bucket_id, add=[(local_path, target_path)], token=token ) - copy_count += 1 else: source_path = source_handle.path source_path_info: list[Union[RepoFile, RepoFolder]] = [] @@ -11963,7 +11960,6 @@ def _flush_hash_based_adds() -> None: token=token, ) self.batch_bucket_files(destination_bucket_id, add=[(local_path, target_path)], token=token) - copy_count += 1 else: if source_path and not destination_is_directory: raise ValueError("Folder copy requires destination to end with '/'.") @@ -11996,10 +11992,9 @@ def _flush_hash_based_adds() -> None: self.batch_bucket_files( destination_bucket_id, add=[(local_path, target_path)], token=token ) - copy_count += 1 _flush_hash_based_adds() - return copy_count + return None @validate_hf_hub_args def batch_bucket_files( diff --git a/tests/test_buckets.py b/tests/test_buckets.py index 02bd948fbf..63399ea180 100644 --- a/tests/test_buckets.py +++ b/tests/test_buckets.py @@ -306,12 +306,11 @@ def 
test_download_bucket_files_raises_on_missing_when_requested(api: HfApi, buck def test_copy_files_bucket_to_same_bucket_file(api: HfApi, bucket_write: str): api.batch_bucket_files(bucket_write, add=[(b"bucket-content", "source.txt")]) - copied_count = api.copy_files( + api.copy_files( f"hf://buckets/{bucket_write}/source.txt", f"hf://buckets/{bucket_write}/copied.txt", ) - assert copied_count == 1 files = {entry.path for entry in api.list_bucket_tree(bucket_write)} assert files >= {"source.txt", "copied.txt"} @@ -322,12 +321,11 @@ def test_copy_files_bucket_to_different_bucket_folder(api: HfApi, bucket_write: destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(source_bucket, add=[(b"a", "logs/a.txt"), (b"b", "logs/sub/b.txt"), (b"c", "other/c.txt")]) - copied_count = api.copy_files( + api.copy_files( f"hf://buckets/{source_bucket}/logs", f"hf://buckets/{destination_bucket}/backup/", ) - assert copied_count == 2 destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} assert destination_files >= {"backup/a.txt", "backup/sub/b.txt"} assert "backup/c.txt" not in destination_files @@ -344,12 +342,11 @@ def test_copy_files_repo_to_bucket_with_revision(api: HfApi, bucket_write: str, repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch ) - copied_count = api.copy_files( + api.copy_files( f"hf://{repo_id}@{branch}/nested/from-branch.txt", f"hf://buckets/{bucket_write}/from-repo.txt", ) - assert copied_count == 1 output_path = tmp_path / "from-repo.txt" api.download_bucket_files(bucket_write, [("from-repo.txt", output_path)]) assert output_path.read_bytes() == b"branch" diff --git a/tests/test_buckets_cli.py b/tests/test_buckets_cli.py index bbec77a37f..9c0fe703db 100644 --- a/tests/test_buckets_cli.py +++ b/tests/test_buckets_cli.py @@ -886,7 +886,7 @@ def test_cp_remote_bucket_to_bucket(api: HfApi): ) assert result.exit_code == 0, result.output - assert "Copied 2 
file(s)" in result.output + assert f"Copied: hf://buckets/{source_bucket}/logs -> hf://buckets/{destination_bucket}/backup/" in result.output files = _remote_files(api, destination_bucket) assert files >= {"backup/a.txt", "backup/sub/b.txt"} assert "backup/c.txt" not in files @@ -908,7 +908,9 @@ def test_cp_remote_repo_to_bucket(api: HfApi): f"hf buckets cp hf://{repo_id}@{branch}/nested/from-branch.txt hf://buckets/{destination_bucket}/copied.txt" ) assert result.exit_code == 0, result.output - assert "Copied 1 file(s)" in result.output + assert ( + f"Copied: hf://{repo_id}@{branch}/nested/from-branch.txt -> hf://buckets/{destination_bucket}/copied.txt" + ) in result.output finally: api.delete_repo(repo_id=repo_id) From 0ad2305d6d73f6c8a4d7f8348e52b0aad4f1bc6e Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 16:13:01 +0100 Subject: [PATCH 03/21] remove useless --- pr.md | 88 ------------------------------------------------ pr_copy_files.md | 71 -------------------------------------- 2 files changed, 159 deletions(-) delete mode 100644 pr.md delete mode 100644 pr_copy_files.md diff --git a/pr.md b/pr.md deleted file mode 100644 index 7e32f91f85..0000000000 --- a/pr.md +++ /dev/null @@ -1,88 +0,0 @@ -## Summary - -This PR adds a new `HfApi.copy_files` API and extends `hf buckets cp` to support remote HF-handle copy workflows. 
- -### New capability - -- Copy from bucket to bucket (same bucket or different bucket) -- Copy from repo (model/dataset/space) to bucket -- Reject bucket->repo and repo->repo destinations (not supported yet) - -## API changes - -### Added - -- `HfApi.copy_files(source, destination, *, token=None) -> None` -- top-level alias export: `copy_files` - -### Handle support - -- Source and destination accept HF handles (`hf://...`) -- Repo source handles support explicit repo type prefixes and optional `@revision` - -Examples: - -- `hf://buckets///path/to/file` -- `hf://datasets///path/to/folder/` -- `hf:////path/to/file` -- `hf:///@/path/to/file` - -## Copy behavior - -- File source: copy one file -- Folder source: recursively copy files under the source folder -- Folder copy requires destination to end with `/` - -### Content transfer strategy - -- Repo source file with `xet_hash`: copied directly by hash -- Repo source file without `xet_hash` (regular small file): download then re-upload -- Bucket source to same bucket: copied by hash -- Bucket source to different bucket: download then re-upload fallback - -## Internal update - -Extended `_batch_bucket_files` internals to accept prebuilt `_BucketAddFile` entries, allowing direct hash-based add operations when hash metadata is already known. 
- -## CLI changes - -Updated `hf buckets cp`: - -- now supports remote->remote HF handle copies via `api.copy_files` -- preserves existing local<->bucket and stdin/stdout behavior -- remote copy output (non-quiet): - - `Copied: SRC -> DST` - -## Tests added - -### `tests/test_buckets.py` - -- `test_copy_files_bucket_to_same_bucket_file` -- `test_copy_files_bucket_to_different_bucket_folder` -- `test_copy_files_repo_to_bucket_with_revision` -- `test_copy_files_bucket_to_repo_raises` -- `test_copy_files_folder_requires_destination_suffix` - -### `tests/test_buckets_cli.py` - -- `test_cp_remote_bucket_to_bucket` -- `test_cp_remote_repo_to_bucket` -- `test_cp_error_bucket_to_repo` -- `test_cp_error_remote_folder_requires_destination_suffix` - -## Documentation - -Updated: - -- `docs/source/en/guides/buckets.md` -- `docs/source/en/guides/cli.md` -- `docs/source/en/package_reference/cli.md` (generated) - -## Validation - -Executed successfully: - -- `make style` -- `make quality` -- `pytest tests/test_buckets.py -k copy_files -q` -- `pytest tests/test_buckets_cli.py -k "cp_remote_bucket_to_bucket or cp_remote_repo_to_bucket or cp_error_bucket_to_repo or cp_error_remote_folder_requires_destination_suffix" -q` diff --git a/pr_copy_files.md b/pr_copy_files.md deleted file mode 100644 index c1abcf370c..0000000000 --- a/pr_copy_files.md +++ /dev/null @@ -1,71 +0,0 @@ -## Summary - -Add a new `HfApi.copy_files` API and wire `hf buckets cp` to support remote HF-handle copy flows. - -### Implemented - -- Added `HfApi.copy_files(source, destination, *, token=None) -> int`. -- Supported copy directions: - - bucket -> same bucket - - bucket -> different bucket - - repo (model/dataset/space) -> bucket -- Explicitly reject bucket/repo destinations that are not buckets (bucket -> repo and repo -> repo). -- Added folder-copy rule: destination must end with `/` for folder sources. -- Added repo `@revision` support in source handles (including special refs parsing). 
-- Added top-level alias export: `copy_files = api.copy_files`. - -## Implementation details - -- Source/destination are parsed as HF handles (`hf://...`) with dedicated copy-handle parsing. -- For repo sources: - - if `xet_hash` exists: direct bucket add by hash - - if `xet_hash is None`: download file then re-upload to bucket -- For bucket sources: - - same-bucket copy: direct add by hash - - cross-bucket copy: download + re-upload fallback (required by backend behavior) -- Extended internal `_batch_bucket_files` to accept prebuilt `_BucketAddFile` entries and skip Xet upload when `xet_hash` is already known. - -## CLI changes - -- `hf buckets cp` now supports remote-to-remote handle copies by delegating to `api.copy_files`. -- Added new examples for: - - bucket -> bucket - - repo -> bucket -- Output now reports count for remote copies: - - `Copied file(s): SRC -> DST` - -## Tests - -### Added in `tests/test_buckets.py` - -- `test_copy_files_bucket_to_same_bucket_file` -- `test_copy_files_bucket_to_different_bucket_folder` -- `test_copy_files_repo_to_bucket_with_revision` -- `test_copy_files_bucket_to_repo_raises` -- `test_copy_files_folder_requires_destination_suffix` - -### Added in `tests/test_buckets_cli.py` - -- `test_cp_remote_bucket_to_bucket` -- `test_cp_remote_repo_to_bucket` -- `test_cp_error_bucket_to_repo` -- `test_cp_error_remote_folder_requires_destination_suffix` - -## Documentation - -Updated: - -- `docs/source/en/guides/buckets.md` -- `docs/source/en/guides/cli.md` -- `docs/source/en/package_reference/cli.md` (generated) - -## Validation - -Executed: - -- `make style` -- `make quality` -- `pytest tests/test_buckets.py -k copy_files -q` -- `pytest tests/test_buckets_cli.py -k "cp_remote_bucket_to_bucket or cp_remote_repo_to_bucket or cp_error_bucket_to_repo or cp_error_remote_folder_requires_destination_suffix" -q` - -All passed. 
From 122c4ae84bfa18a8d4e6661431e087de448e68f1 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 16:32:55 +0100 Subject: [PATCH 04/21] docs --- docs/source/en/guides/buckets.md | 39 ++++++++++++++++---------------- docs/source/en/guides/cli.md | 5 ++-- src/huggingface_hub/hf_api.py | 2 +- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/source/en/guides/buckets.md b/docs/source/en/guides/buckets.md index 64f2f98a3b..4256c1e8e5 100644 --- a/docs/source/en/guides/buckets.md +++ b/docs/source/en/guides/buckets.md @@ -456,14 +456,28 @@ You can also pipe the content of a file directly to `stdout` using `-`: >>> hf buckets cp hf://buckets/username/my-bucket/config.json - | jq . ``` -### Copy between remote HF handles +### Download a directory with the CLI + +Use `hf buckets sync` to download all files from a bucket to a local directory: + +```bash +# Download bucket contents to a local directory +>>> hf buckets sync hf://buckets/username/my-bucket ./data + +# Download only a specific prefix +>>> hf buckets sync hf://buckets/username/my-bucket/models ./local-models +``` + +See the [Sync directories](#sync-directories) section below for the full set of sync options. + +## Copy files to Bucket -Use [`copy_files`] to copy files between HF handles: +Use [`copy_files`] to copy files already hosted on the Hub to a Bucket: ```py >>> from huggingface_hub import copy_files -# Bucket to bucket (same bucket or different bucket) +# Bucket to bucket (same or different bucket) >>> copy_files( ... "hf://buckets/username/source-bucket/checkpoints/model.safetensors", ... "hf://buckets/username/destination-bucket/archive/model.safetensors", @@ -489,23 +503,8 @@ The same is available from the CLI: Notes: - Folder copy requires destination to end with `/`. -- Bucket-to-repo copy is not supported. -- For repo sources, files with an available `xet_hash` are copied directly. Regular files without `xet_hash` are downloaded - and re-uploaded. 
- -### Download a directory with the CLI - -Use `hf buckets sync` to download all files from a bucket to a local directory: - -```bash -# Download bucket contents to a local directory ->>> hf buckets sync hf://buckets/username/my-bucket ./data - -# Download only a specific prefix ->>> hf buckets sync hf://buckets/username/my-bucket/models ./local-models -``` - -See the [Sync directories](#sync-directories) section below for the full set of sync options. +- Bucket-to-repo copy is not yet supported. +- Small text files are not tracked with Xet on repo sources. To copy them to a Bucket, they are downloaded and re-uploaded. ## Sync directories diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 974157e585..625a8a7a0e 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -657,7 +657,8 @@ To filter by prefix, append the prefix to the bucket path: ### Copy single files -Use `hf buckets cp` to copy files between local paths and HF handles, or between remote HF handles. +Use `hf buckets cp` to copy individual files to and from a bucket, or to copy any file hosted on the Hub to a Bucket. +Bucket paths use the `hf://buckets/` prefix. To upload a file: @@ -687,7 +688,7 @@ You can also stream to stdout or from stdin using `-`: >>> echo "hello" | hf buckets cp - hf://buckets/username/my-bucket/hello.txt ``` -To copy between remote handles: +To copy from a repo or a bucket on the Hub: ```bash # Bucket to bucket diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 784a5ad743..a68d3a1880 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -11818,7 +11818,7 @@ def copy_files( *, token: Union[str, bool, None] = None, ) -> None: - """Copy files between HF handles. + """Copy files on the Hub. 
Supported: - bucket -> bucket From 641470a7a621342e94e84a8a9a63b65a1f2805d0 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 16:34:43 +0100 Subject: [PATCH 05/21] do not catch --- src/huggingface_hub/cli/buckets.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/huggingface_hub/cli/buckets.py b/src/huggingface_hub/cli/buckets.py index 0c7cbd511d..a45ff18dfb 100644 --- a/src/huggingface_hub/cli/buckets.py +++ b/src/huggingface_hub/cli/buckets.py @@ -956,8 +956,6 @@ def cp( disable_progress_bars() try: api.copy_files(src, dst) - except ValueError as error: - raise typer.BadParameter(str(error)) finally: if quiet: enable_progress_bars() From 5c5b5ea197c4789f1506ee5e6f219f4fa7b553e3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 16:35:15 +0100 Subject: [PATCH 06/21] comment --- src/huggingface_hub/cli/buckets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/huggingface_hub/cli/buckets.py b/src/huggingface_hub/cli/buckets.py index a45ff18dfb..0e75d4dea7 100644 --- a/src/huggingface_hub/cli/buckets.py +++ b/src/huggingface_hub/cli/buckets.py @@ -950,6 +950,7 @@ def cp( src_is_stdin = src == "-" dst_is_stdout = dst == "-" + # Remote to remote copy if src_is_hf and dst_is_hf: assert dst is not None if quiet: @@ -964,6 +965,7 @@ def cp( print(f"Copied: {src} -> {dst}") return + # Local to remote copy # --- Validation --- if not src_is_bucket and not dst_is_bucket and not src_is_stdin: if dst is None: From d97522fce2a514beb8ad0caeac6157ce59428f32 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 2 Mar 2026 17:13:07 +0100 Subject: [PATCH 07/21] much better --- src/huggingface_hub/__init__.py | 3 + src/huggingface_hub/_buckets.py | 9 +- src/huggingface_hub/hf_api.py | 172 +++++++++++++++----------------- 3 files changed, 88 insertions(+), 96 deletions(-) diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 01713d269b..0cc6b315c9 100644 --- a/src/huggingface_hub/__init__.py +++ 
b/src/huggingface_hub/__init__.py @@ -201,6 +201,7 @@ "cancel_job", "change_discussion_status", "comment_discussion", + "copy_files", "create_branch", "create_bucket", "create_collection", @@ -891,6 +892,7 @@ "check_cli_update", "close_session", "comment_discussion", + "copy_files", "create_branch", "create_bucket", "create_collection", @@ -1305,6 +1307,7 @@ def __dir__(): cancel_job, # noqa: F401 change_discussion_status, # noqa: F401 comment_discussion, # noqa: F401 + copy_files, # noqa: F401 create_branch, # noqa: F401 create_bucket, # noqa: F401 create_collection, # noqa: F401 diff --git a/src/huggingface_hub/_buckets.py b/src/huggingface_hub/_buckets.py index f291890d06..aa3033fd7f 100644 --- a/src/huggingface_hub/_buckets.py +++ b/src/huggingface_hub/_buckets.py @@ -114,9 +114,12 @@ def __post_init__(self) -> None: if self.content_type is None: # or default to destination path content type self.content_type = mimetypes.guess_type(self.destination)[0] - self.mtime = int( - os.path.getmtime(self.source) * 1000 if not isinstance(self.source, bytes) else time.time() * 1000 - ) + self.mtime = int(time.time() * 1000) + if isinstance(self.source, str): + try: + self.mtime = int(os.path.getmtime(self.source) * 1000) + except FileNotFoundError: + pass @dataclass diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index a68d3a1880..6e11d79cf1 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -20,7 +20,6 @@ import json import re import struct -import tempfile import time import warnings from collections import defaultdict @@ -11818,17 +11817,48 @@ def copy_files( *, token: Union[str, bool, None] = None, ) -> None: - """Copy files on the Hub. + """Copy files between locations on the Hub. - Supported: - - bucket -> bucket - - repo (model/dataset/space) -> bucket + Copy files from a bucket or repository (model, dataset, space) to a bucket. Both individual files and + entire folders are supported. 
When copying a folder, the destination path must end with `/`. - Unsupported: - - bucket -> repo - - local paths + Currently, only bucket destinations are supported. Copying to a repository is not supported. - If source is a folder, destination must end with ``/``. + Args: + source (`str`): + Source location as an `hf://` handle. Can be a bucket path (e.g. `"hf://buckets/my-bucket/path/to/file"`) + or a repo path (e.g. `"hf://username/my-model/weights.bin"`, `"hf://datasets/username/my-dataset/data/"`). + destination (`str`): + Destination location as an `hf://` handle pointing to a bucket + (e.g. `"hf://buckets/my-bucket/target/path"`). Must end with `/` when copying a folder. + token (`bool` or `str`, *optional*): + A valid user access token (string). Defaults to the locally saved + token, which is the recommended method for authentication (see + https://huggingface.co/docs/huggingface_hub/quick-start#authentication). + To disable authentication, pass `False`. + + Raises: + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): + If the destination is not a bucket, if the source/destination handles are invalid, or if a folder + copy is attempted without a trailing `/` on the destination. 
+ + Example: + ```python + >>> from huggingface_hub import HfApi + >>> api = HfApi() + + # Copy a single file between buckets + >>> api.copy_files("hf://buckets/my-bucket/data.bin", "hf://buckets/other-bucket/data.bin") + + # Copy a folder from a bucket to another bucket + >>> api.copy_files("hf://buckets/my-bucket/models/", "hf://buckets/other-bucket/backup/") + + # Copy a file from a model repo to a bucket + >>> api.copy_files("hf://username/my-model/model.safetensors", "hf://buckets/my-bucket/") + + # Copy an entire dataset to a bucket + >>> api.copy_files("hf://datasets/username/my-dataset/", "hf://buckets/my-bucket/datasets/") + ``` """ source_handle = _parse_hf_copy_handle(source) destination_handle = _parse_hf_copy_handle(destination) @@ -11836,14 +11866,11 @@ def copy_files( if isinstance(destination_handle, _RepoCopyHandle): raise ValueError("Bucket-to-repo and repo-to-repo copy are not supported. Destination must be a bucket.") - if isinstance(source_handle, _BucketCopyHandle) and isinstance(destination_handle, _RepoCopyHandle): - raise ValueError("Bucket-to-repo copy is not supported.") - destination_bucket_id = destination_handle.bucket_id destination_path = destination_handle.path destination_is_directory = destination_handle.is_directory or destination_path == "" - hash_based_adds: list[_BucketAddFile] = [] + all_adds: list[Union[_BucketAddFile, tuple[str, str]]] = [] def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_single_file: bool) -> str: basename = src_file_path.rsplit("/", 1)[-1] @@ -11871,38 +11898,39 @@ def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_si return rel_path return f"{destination_path.rstrip('/')}/{rel_path}" - def _flush_hash_based_adds() -> None: - nonlocal hash_based_adds - if not hash_based_adds: - return - for add_chunk in chunk_iterable(hash_based_adds, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): - self._batch_bucket_files(destination_bucket_id, add=list(add_chunk), 
token=token) - hash_based_adds = [] + def _hash_copy(target_path: str, xet_hash: str, size: int) -> _BucketAddFile: + """Server-side copy by xet hash — no data transfer needed.""" + return _BucketAddFile(source=b"", destination=target_path, xet_hash=xet_hash, size=size) + + def _download_from_repo(file_path: str) -> str: + """Download a repo file to local cache, return the cache path.""" + return self.hf_hub_download( + repo_id=source_handle.repo_id, + repo_type=source_handle.repo_type, + filename=file_path, + revision=source_handle.revision, + token=token, + ) + + def _add_repo_file(file: RepoFile, target_path: str) -> None: + """Queue a repo file: hash-copy if xet-backed, otherwise download first.""" + if file.xet_hash is not None: + all_adds.append(_hash_copy(target_path, file.xet_hash, file.size)) + else: + all_adds.append((_download_from_repo(file.path), target_path)) + # === Source is a bucket: always hash-based copy (no download needed) === if isinstance(source_handle, _BucketCopyHandle): source_path = source_handle.path - same_bucket_copy = source_handle.bucket_id == destination_bucket_id source_path_info = list(self.get_bucket_paths_info(source_handle.bucket_id, [source_path], token=token)) + if source_path_info: + # Source path matched a single file source_file = source_path_info[0] target_path = _resolve_target_path(source_file.path, None, is_single_file=True) - if same_bucket_copy: - hash_based_adds.append( - _BucketAddFile( - source=b"", - destination=target_path, - xet_hash=source_file.xet_hash, - size=source_file.size, - ) - ) - else: - with tempfile.TemporaryDirectory() as tmp_dir: - local_path = str(Path(tmp_dir) / source_file.path.rsplit("/", 1)[-1]) - self.download_bucket_files( - source_handle.bucket_id, [(source_file.path, local_path)], token=token - ) - self.batch_bucket_files(destination_bucket_id, add=[(local_path, target_path)], token=token) + all_adds.append(_hash_copy(target_path, source_file.xet_hash, source_file.size)) else: + # Source 
path is a folder (or prefix) — list and copy all matching files if source_path != "" and not destination_is_directory: raise ValueError("Folder copy requires destination to end with '/'.") for item in self.list_bucket_tree( @@ -11913,18 +11941,9 @@ def _flush_hash_based_adds() -> None: if source_path and not (item.path == source_path or item.path.startswith(source_path + "/")): continue target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) - if same_bucket_copy: - hash_based_adds.append( - _BucketAddFile(source=b"", destination=target_path, xet_hash=item.xet_hash, size=item.size) - ) - else: - _flush_hash_based_adds() - with tempfile.TemporaryDirectory() as tmp_dir: - local_path = str(Path(tmp_dir) / item.path.rsplit("/", 1)[-1]) - self.download_bucket_files(source_handle.bucket_id, [(item.path, local_path)], token=token) - self.batch_bucket_files( - destination_bucket_id, add=[(local_path, target_path)], token=token - ) + all_adds.append(_hash_copy(target_path, item.xet_hash, item.size)) + + # === Source is a repo: hash-copy if xet-backed, download otherwise === else: source_path = source_handle.path source_path_info: list[Union[RepoFile, RepoFolder]] = [] @@ -11938,34 +11957,16 @@ def _flush_hash_based_adds() -> None: ) if len(source_path_info) == 1 and isinstance(source_path_info[0], RepoFile): - source_file = source_path_info[0] - target_path = _resolve_target_path(source_file.path, None, is_single_file=True) - if source_file.xet_hash is not None: - hash_based_adds.append( - _BucketAddFile( - source=b"", - destination=target_path, - xet_hash=source_file.xet_hash, - size=source_file.size, - ) - ) - else: - with tempfile.TemporaryDirectory() as tmp_dir: - local_path = self.hf_hub_download( - repo_id=source_handle.repo_id, - repo_type=source_handle.repo_type, - filename=source_file.path, - revision=source_handle.revision, - local_dir=tmp_dir, - token=token, - ) - self.batch_bucket_files(destination_bucket_id, add=[(local_path, 
target_path)], token=token) + # Source path matched a single file + target_path = _resolve_target_path(source_path_info[0].path, None, is_single_file=True) + _add_repo_file(source_path_info[0], target_path) else: + # Source path is a folder — list and copy all files recursively if source_path and not destination_is_directory: raise ValueError("Folder copy requires destination to end with '/'.") for item in self.list_repo_tree( repo_id=source_handle.repo_id, - path_in_repo=source_path or None, + path_in_repo=source_path, recursive=True, repo_type=source_handle.repo_type, revision=source_handle.revision, @@ -11974,27 +11975,12 @@ def _flush_hash_based_adds() -> None: if not isinstance(item, RepoFile): continue target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) - if item.xet_hash is not None: - hash_based_adds.append( - _BucketAddFile(source=b"", destination=target_path, xet_hash=item.xet_hash, size=item.size) - ) - else: - _flush_hash_based_adds() - with tempfile.TemporaryDirectory() as tmp_dir: - local_path = self.hf_hub_download( - repo_id=source_handle.repo_id, - repo_type=source_handle.repo_type, - filename=item.path, - revision=source_handle.revision, - local_dir=tmp_dir, - token=token, - ) - self.batch_bucket_files( - destination_bucket_id, add=[(local_path, target_path)], token=token - ) - - _flush_hash_based_adds() - return None + _add_repo_file(item, target_path) + + # Single batched call at the end for all collected adds + if all_adds: + for add_chunk in chunk_iterable(all_adds, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): + self._batch_bucket_files(destination_bucket_id, add=list(add_chunk), token=token) @validate_hf_hub_args def batch_bucket_files( From d99a877d04c4ef63fbb9caad3a1f05a378ba86b5 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 3 Apr 2026 18:39:36 +0200 Subject: [PATCH 08/21] Server-side copies --- docs/source/en/guides/buckets.md | 19 ++++- src/huggingface_hub/_buckets.py | 15 ++++ 
src/huggingface_hub/hf_api.py | 122 +++++++++++++++++++++++-------- 3 files changed, 125 insertions(+), 31 deletions(-) diff --git a/docs/source/en/guides/buckets.md b/docs/source/en/guides/buckets.md index 4256c1e8e5..28b991a02c 100644 --- a/docs/source/en/guides/buckets.md +++ b/docs/source/en/guides/buckets.md @@ -348,6 +348,20 @@ Use [`batch_bucket_files`] to upload files to a bucket. You can upload from loca ... ) ``` +You can also copy xet files from another bucket or repository using the `copy` parameter. This is a server-side +operation — no data is downloaded or re-uploaded: + +```python +# Copy files by xet hash (source_repo_type, source_repo_id, xet_hash, destination) +>>> batch_bucket_files( +... "username/my-bucket", +... copy=[ +... ("bucket", "username/source-bucket", "xet-hash-abc123", "models/model.safetensors"), +... ("model", "username/my-model", "xet-hash-def456", "models/config.safetensors"), +... ], +... ) +``` + You can also delete files while uploading others. ```python @@ -360,7 +374,7 @@ You can also delete files while uploading others. ``` > [!WARNING] -> Calls to [`batch_bucket_files`] are non-transactional. If an error occurs during the process, some files may have been uploaded or deleted while others haven't. +> Calls to [`batch_bucket_files`] are non-transactional. If an error occurs during the process, some files may have been uploaded, copied, or deleted while others haven't. ### Upload a single file with the CLI @@ -504,7 +518,8 @@ Notes: - Folder copy requires destination to end with `/`. - Bucket-to-repo copy is not yet supported. -- Small text files are not tracked with Xet on repo sources. To copy them to a Bucket, they are downloaded and re-uploaded. +- Files tracked with Xet (in buckets or repos) are copied server-side by hash — no data is downloaded or re-uploaded. +- Small text files not tracked with Xet on repo sources are downloaded and re-uploaded to the destination bucket. 
## Sync directories diff --git a/src/huggingface_hub/_buckets.py b/src/huggingface_hub/_buckets.py index 58b825161b..e38952d6cd 100644 --- a/src/huggingface_hub/_buckets.py +++ b/src/huggingface_hub/_buckets.py @@ -122,6 +122,21 @@ def __post_init__(self) -> None: pass +@dataclass +class _BucketCopyFile: + destination: str + xet_hash: str + source_repo_type: str # "model", "dataset", "space", "bucket" + source_repo_id: str + size: int | None = field(default=None) + mtime: int = field(init=False) + content_type: str | None = field(init=False) + + def __post_init__(self) -> None: + self.content_type = mimetypes.guess_type(self.destination)[0] + self.mtime = int(time.time() * 1000) + + @dataclass class _BucketDeleteFile: path: str diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 56be36192a..ea3f065aab 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -35,8 +35,6 @@ BinaryIO, Literal, Optional, - Sequence, - Type, TypeVar, overload, ) @@ -62,6 +60,7 @@ BucketUrl, SyncPlan, _BucketAddFile, + _BucketCopyFile, _BucketDeleteFile, _split_bucket_id_and_prefix, sync_bucket_internal, @@ -414,7 +413,7 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: str | None = None) -> tuple return repo_type, namespace, repo_id -def _parse_hf_copy_handle(hf_handle: str) -> Union[_BucketCopyHandle, _RepoCopyHandle]: +def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle: if not hf_handle.startswith("hf://"): raise ValueError(f"Invalid HF handle: '{hf_handle}'. Expected a path starting with 'hf://'.") @@ -12499,13 +12498,7 @@ def get_bucket_paths_info( yield BucketFile(**path_info) @validate_hf_hub_args - def copy_files( - self, - source: str, - destination: str, - *, - token: Union[str, bool, None] = None, - ) -> None: + def copy_files(self, source: str, destination: str, *, token: str | bool | None = None) -> None: """Copy files between locations on the Hub. 
Copy files from a bucket or repository (model, dataset, space) to a bucket. Both individual files and @@ -12559,7 +12552,8 @@ def copy_files( destination_path = destination_handle.path destination_is_directory = destination_handle.is_directory or destination_path == "" - all_adds: list[Union[_BucketAddFile, tuple[str, str]]] = [] + all_adds: list[_BucketAddFile | tuple[str, str]] = [] + all_copies: list[_BucketCopyFile] = [] def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_single_file: bool) -> str: basename = src_file_path.rsplit("/", 1)[-1] @@ -12587,9 +12581,17 @@ def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_si return rel_path return f"{destination_path.rstrip('/')}/{rel_path}" - def _hash_copy(target_path: str, xet_hash: str, size: int) -> _BucketAddFile: + def _copy_by_hash( + target_path: str, xet_hash: str, size: int, source_repo_type: str, source_repo_id: str + ) -> _BucketCopyFile: """Server-side copy by xet hash — no data transfer needed.""" - return _BucketAddFile(source=b"", destination=target_path, xet_hash=xet_hash, size=size) + return _BucketCopyFile( + destination=target_path, + xet_hash=xet_hash, + source_repo_type=source_repo_type, + source_repo_id=source_repo_id, + size=size, + ) def _download_from_repo(file_path: str) -> str: """Download a repo file to local cache, return the cache path.""" @@ -12602,9 +12604,13 @@ def _download_from_repo(file_path: str) -> str: ) def _add_repo_file(file: RepoFile, target_path: str) -> None: - """Queue a repo file: hash-copy if xet-backed, otherwise download first.""" + """Queue a repo file: copy-by-hash if xet-backed, otherwise download first.""" if file.xet_hash is not None: - all_adds.append(_hash_copy(target_path, file.xet_hash, file.size)) + all_copies.append( + _copy_by_hash( + target_path, file.xet_hash, file.size, source_handle.repo_type, source_handle.repo_id + ) + ) else: all_adds.append((_download_from_repo(file.path), target_path)) @@ 
-12617,7 +12623,11 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: # Source path matched a single file source_file = source_path_info[0] target_path = _resolve_target_path(source_file.path, None, is_single_file=True) - all_adds.append(_hash_copy(target_path, source_file.xet_hash, source_file.size)) + all_copies.append( + _copy_by_hash( + target_path, source_file.xet_hash, source_file.size, "bucket", source_handle.bucket_id + ) + ) else: # Source path is a folder (or prefix) — list and copy all matching files if source_path != "" and not destination_is_directory: @@ -12630,12 +12640,14 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: if source_path and not (item.path == source_path or item.path.startswith(source_path + "/")): continue target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) - all_adds.append(_hash_copy(target_path, item.xet_hash, item.size)) + all_copies.append( + _copy_by_hash(target_path, item.xet_hash, item.size, "bucket", source_handle.bucket_id) + ) - # === Source is a repo: hash-copy if xet-backed, download otherwise === + # === Source is a repo: copy-by-hash if xet-backed, download otherwise === else: source_path = source_handle.path - source_path_info: list[Union[RepoFile, RepoFolder]] = [] + source_path_info: list[RepoFile | RepoFolder] = [] if source_path != "": source_path_info = self.get_paths_info( repo_id=source_handle.repo_id, @@ -12666,7 +12678,10 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) _add_repo_file(item, target_path) - # Single batched call at the end for all collected adds + # Send copies first (no upload needed), then adds (may need upload) + if all_copies: + for copy_chunk in chunk_iterable(all_copies, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): + self._batch_bucket_files(destination_bucket_id, copy=list(copy_chunk), token=token) if all_adds: for 
add_chunk in chunk_iterable(all_adds, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): self._batch_bucket_files(destination_bucket_id, add=list(add_chunk), token=token) @@ -12677,12 +12692,14 @@ def batch_bucket_files( bucket_id: str, *, add: list[tuple[str | Path | bytes, str]] | None = None, + copy: list[tuple[str, str, str, str]] | None = None, delete: list[str] | None = None, token: str | bool | None = None, ): - """Add and/or delete files in a bucket. + """Add, copy, and/or delete files in a bucket. - This is a non-transactional operation. If an error occurs in the process, some files may have been uploaded or deleted, + This is a non-transactional operation. If an error occurs in the process, some files may have been uploaded, + copied, or deleted while others haven't. Args: bucket_id (`str`): @@ -12690,6 +12707,15 @@ def batch_bucket_files( add (`list` of `tuple`, *optional*): Files to upload. Each element is a `(source, destination)` tuple where `source` is a path to a local file (`str` or `Path`) or raw `bytes` content, and `destination` is the path in the bucket. + copy (`list` of `tuple`, *optional*): + Files to copy by xet hash. Each element is a `(source_repo_type, source_repo_id, xet_hash, + destination)` tuple where: + - `source_repo_type` is the type of the source repository: `"model"`, `"dataset"`, `"space"`, or + `"bucket"`. + - `source_repo_id` is the ID of the source repository or bucket (e.g. `"username/my-model"`). + - `xet_hash` is the xet hash of the file to copy. + - `destination` is the destination path in the bucket. + This is a server-side operation — no data is downloaded or re-uploaded. delete (`list` of `str`, *optional*): Paths of files to delete from the bucket. token (`bool` or `str`, *optional*): @@ -12711,6 +12737,15 @@ def batch_bucket_files( ... ], ... ) + # Copy xet files from another bucket or repo (server-side, no data transfer) + >>> batch_bucket_files( + ... "username/my-bucket", + ... copy=[ + ... 
("bucket", "username/source-bucket", "xet-hash-abc123", "models/model.safetensors"), + ... ("model", "username/my-model", "xet-hash-def456", "models/config.safetensors"), + ... ], + ... ) + # Delete files >>> batch_bucket_files("username/my-bucket", delete=["old-model.bin"]) @@ -12723,16 +12758,31 @@ def batch_bucket_files( ``` """ add = [] if add is None else add + copy = [] if copy is None else copy delete = [] if delete is None else delete + # Convert public copy tuples to internal _BucketCopyFile objects + copy_operations = [ + _BucketCopyFile( + destination=destination, + xet_hash=xet_hash, + source_repo_type=source_repo_type, + source_repo_id=source_repo_id, + ) + for source_repo_type, source_repo_id, xet_hash, destination in copy + ] + # Small batch: do everything in one call - if len(add) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: - add_payload: Optional[list[tuple[Union[str, Path, bytes], str]]] = add if len(add) > 0 else None - delete_payload: Optional[list[str]] = delete if len(delete) > 0 else None - self._batch_bucket_files(bucket_id, add=add_payload, delete=delete_payload, token=token) + if len(add) + len(copy_operations) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: + add_payload: list[tuple[str | Path | bytes, str] | _BucketAddFile] | None = ( + list(add) if len(add) > 0 else None + ) + copy_payload: list[_BucketCopyFile] | None = copy_operations if len(copy_operations) > 0 else None + delete_payload: list[str] | None = delete if len(delete) > 0 else None + self._batch_bucket_files(bucket_id, add=add_payload, copy=copy_payload, delete=delete_payload, token=token) return - # Large batch: chunk adds first, then deletes + # Large batch: chunk copies first (no upload), then adds, then deletes from .utils._xet_progress_reporting import XetProgressReporter if add and not are_progress_bars_disabled(): @@ -12741,8 +12791,11 @@ def batch_bucket_files( progress = None try: + for copy_chunk in chunk_iterable(copy_operations, 
chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): + self._batch_bucket_files(bucket_id, copy=list(copy_chunk), token=token) + for add_chunk in chunk_iterable(add, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): - add_chunk_list: list[tuple[Union[str, Path, bytes], str]] = list(add_chunk) + add_chunk_list: list[tuple[str | Path | bytes, str] | _BucketAddFile] = list(add_chunk) self._batch_bucket_files(bucket_id, add=add_chunk_list, token=token, _progress=progress) for delete_chunk in chunk_iterable(delete, chunk_size=_BUCKET_BATCH_DELETE_CHUNK_SIZE): @@ -12758,13 +12811,14 @@ def _batch_bucket_files( bucket_id: str, *, add: list[tuple[str | Path | bytes, str] | _BucketAddFile] | None = None, + copy: list[_BucketCopyFile] | None = None, delete: list[str] | None = None, token: str | bool | None = None, _progress: XetProgressReporter | None = None, ): """Internal method: process a single batch of bucket file operations (upload to XET + call /batch).""" # Convert public API inputs to internal operation objects - operations: list[_BucketAddFile | _BucketDeleteFile] = [] + operations: list[_BucketAddFile | _BucketCopyFile | _BucketDeleteFile] = [] if add: for item in add: if isinstance(item, _BucketAddFile): @@ -12772,6 +12826,8 @@ def _batch_bucket_files( else: source, destination = item operations.append(_BucketAddFile(source=source, destination=destination)) + if copy: + operations.extend(copy) if delete: for path in delete: operations.append(_BucketDeleteFile(path=path)) @@ -12886,6 +12942,14 @@ def _payload_as_ndjson() -> Iterable[bytes]: } if op.content_type is not None: payload["contentType"] = op.content_type + elif isinstance(op, _BucketCopyFile): + payload = { + "type": "copyFile", + "path": op.destination, + "xetHash": op.xet_hash, + "sourceRepoType": op.source_repo_type, + "sourceRepoId": op.source_repo_id, + } else: payload = { "type": "deleteFile", From 1dee89ac4edbf401abb1028e7830f82881ab47f3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 14:52:57 
+0200 Subject: [PATCH 09/21] check remote path --- docs/source/en/guides/buckets.md | 11 ++-- docs/source/en/guides/cli.md | 2 +- src/huggingface_hub/hf_api.py | 28 ++++++---- tests/test_buckets.py | 87 ++++++++++++++++++++++++++++++-- 4 files changed, 109 insertions(+), 19 deletions(-) diff --git a/docs/source/en/guides/buckets.md b/docs/source/en/guides/buckets.md index 28b991a02c..fd980e6f2d 100644 --- a/docs/source/en/guides/buckets.md +++ b/docs/source/en/guides/buckets.md @@ -348,20 +348,21 @@ Use [`batch_bucket_files`] to upload files to a bucket. You can upload from loca ... ) ``` -You can also copy xet files from another bucket or repository using the `copy` parameter. This is a server-side -operation — no data is downloaded or re-uploaded: +You can also copy xet files from another bucket or repository using the `copy` parameter. This is a server-side operation — no data is downloaded or re-uploaded: ```python # Copy files by xet hash (source_repo_type, source_repo_id, xet_hash, destination) >>> batch_bucket_files( ... "username/my-bucket", ... copy=[ -... ("bucket", "username/source-bucket", "xet-hash-abc123", "models/model.safetensors"), -... ("model", "username/my-model", "xet-hash-def456", "models/config.safetensors"), +... ("bucket", "username/source-bucket", "", "models/model.safetensors"), +... ("model", "username/my-model", "", "models/config.safetensors"), ... ], ... ) ``` +Xet hashes can be retrieved using `list_repo_tree`. + You can also delete files while uploading others. ```python @@ -516,7 +517,7 @@ The same is available from the CLI: Notes: -- Folder copy requires destination to end with `/`. +- When copying a folder, the destination is checked remotely to determine whether it is an existing file. If it is, an error is raised. Otherwise, it is treated as a directory. - Bucket-to-repo copy is not yet supported. - Files tracked with Xet (in buckets or repos) are copied server-side by hash — no data is downloaded or re-uploaded. 
- Small text files not tracked with Xet on repo sources are downloaded and re-uploaded to the destination bucket. diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 898800538d..2bd94a87c8 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -716,7 +716,7 @@ To copy from a repo or a bucket on the Hub: Notes: -- Folder copy requires destination to end with `/`. +- When copying a folder, the destination is checked remotely to determine whether it is an existing file. If it is, an error is raised. Otherwise, it is treated as a directory. - Bucket-to-repo copy is not supported. ### Sync directories diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 884cdf3040..7bc8b0ee74 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -12502,7 +12502,7 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None """Copy files between locations on the Hub. Copy files from a bucket or repository (model, dataset, space) to a bucket. Both individual files and - entire folders are supported. When copying a folder, the destination path must end with `/`. + entire folders are supported. Currently, only bucket destinations are supported. Copying to a repository is not supported. @@ -12512,7 +12512,7 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None or a repo path (e.g. `"hf://username/my-model/weights.bin"`, `"hf://datasets/username/my-dataset/data/"`). destination (`str`): Destination location as an `hf://` handle pointing to a bucket - (e.g. `"hf://buckets/my-bucket/target/path"`). Must end with `/` when copying a folder. + (e.g. `"hf://buckets/my-bucket/target/path"`). token (`bool` or `str`, *optional*): A valid user access token (string). 
Defaults to the locally saved token, which is the recommended method for authentication (see @@ -12522,7 +12522,7 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None Raises: [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): If the destination is not a bucket, if the source/destination handles are invalid, or if a folder - copy is attempted without a trailing `/` on the destination. + copy targets an existing file in the destination bucket. Example: ```python @@ -12564,8 +12564,6 @@ def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_si return f"{destination_path.rstrip('/')}/{basename}" return destination_path - if not destination_is_directory: - raise ValueError("Folder copy requires destination to end with '/'.") if src_root_path is None: rel_path = src_file_path elif src_file_path.startswith(src_root_path + "/"): @@ -12630,8 +12628,14 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: ) else: # Source path is a folder (or prefix) — list and copy all matching files - if source_path != "" and not destination_is_directory: - raise ValueError("Folder copy requires destination to end with '/'.") + if source_path != "": + # Always verify the destination is not an existing file in the bucket. 
+ dest_info = list( + self.get_bucket_paths_info(destination_bucket_id, [destination_path], token=token) + ) + if dest_info: + raise ValueError(f"Cannot copy a folder to a file destination '{destination_path}'.") + destination_is_directory = True for item in self.list_bucket_tree( source_handle.bucket_id, prefix=source_path or None, recursive=True, token=token ): @@ -12663,8 +12667,14 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: _add_repo_file(source_path_info[0], target_path) else: # Source path is a folder — list and copy all files recursively - if source_path and not destination_is_directory: - raise ValueError("Folder copy requires destination to end with '/'.") + if source_path: + # Always verify the destination is not an existing file in the bucket. + dest_info = list( + self.get_bucket_paths_info(destination_bucket_id, [destination_path], token=token) + ) + if dest_info: + raise ValueError(f"Cannot copy a folder to a file destination '{destination_path}'.") + destination_is_directory = True for item in self.list_repo_tree( repo_id=source_handle.repo_id, path_in_repo=source_path, diff --git a/tests/test_buckets.py b/tests/test_buckets.py index f323671ce7..ed13432556 100644 --- a/tests/test_buckets.py +++ b/tests/test_buckets.py @@ -366,17 +366,96 @@ def test_copy_files_bucket_to_repo_raises(api: HfApi, bucket_write: str): @requires("hf_xet") -def test_copy_files_folder_requires_destination_suffix(api: HfApi, bucket_write: str): +def test_copy_files_folder_to_nonexistent_dest(api: HfApi, bucket_write: str): + """source=folder, dest doesn't exist => files copied under dest path.""" destination_bucket = api.create_bucket(bucket_name()).bucket_id - api.batch_bucket_files(bucket_write, add=[(b"x", "folder/x.txt")]) + api.batch_bucket_files(bucket_write, add=[(b"a", "folder/a.txt"), (b"b", "folder/sub/b.txt")]) - with pytest.raises(ValueError, match="destination to end with '/'"): + api.copy_files( + f"hf://buckets/{bucket_write}/folder", + 
f"hf://buckets/{destination_bucket}/target-folder", + ) + + destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} + assert destination_files >= {"target-folder/a.txt", "target-folder/sub/b.txt"} + + +@requires("hf_xet") +def test_copy_files_folder_to_existing_folder_dest(api: HfApi, bucket_write: str): + """source=folder, dest is an existing folder => files merged under dest path.""" + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(bucket_write, add=[(b"a", "folder/a.txt"), (b"b", "folder/sub/b.txt")]) + api.batch_bucket_files(destination_bucket, add=[(b"existing", "target-folder/existing.txt")]) + + api.copy_files( + f"hf://buckets/{bucket_write}/folder", + f"hf://buckets/{destination_bucket}/target-folder", + ) + + destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} + assert destination_files >= {"target-folder/existing.txt", "target-folder/a.txt", "target-folder/sub/b.txt"} + + +@requires("hf_xet") +def test_copy_files_folder_to_existing_file_dest_raises(api: HfApi, bucket_write: str): + """source=folder, dest is an existing file => must raise.""" + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(bucket_write, add=[(b"a", "folder/a.txt")]) + api.batch_bucket_files(destination_bucket, add=[(b"existing-file", "target-file")]) + + with pytest.raises(ValueError, match="Cannot copy a folder to a file destination"): api.copy_files( f"hf://buckets/{bucket_write}/folder", - f"hf://buckets/{destination_bucket}/target-folder", + f"hf://buckets/{destination_bucket}/target-file", ) +@requires("hf_xet") +def test_copy_files_file_to_existing_file_dest(api: HfApi, bucket_write: str): + """source=file, dest is an existing file => must work (overwrite).""" + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(bucket_write, add=[(b"new-content", "source.txt")]) + 
api.batch_bucket_files(destination_bucket, add=[(b"old-content", "dest.txt")]) + + api.copy_files( + f"hf://buckets/{bucket_write}/source.txt", + f"hf://buckets/{destination_bucket}/dest.txt", + ) + + destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} + assert "dest.txt" in destination_files + + +@requires("hf_xet") +def test_copy_files_file_to_nonexistent_dest(api: HfApi, bucket_write: str): + """source=file, dest doesn't exist => must work (creates file).""" + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(bucket_write, add=[(b"content", "source.txt")]) + + api.copy_files( + f"hf://buckets/{bucket_write}/source.txt", + f"hf://buckets/{destination_bucket}/new-file.txt", + ) + + destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} + assert "new-file.txt" in destination_files + + +@requires("hf_xet") +def test_copy_files_file_to_folder_dest(api: HfApi, bucket_write: str): + """source=file, dest is a folder (trailing '/') => file added to folder.""" + destination_bucket = api.create_bucket(bucket_name()).bucket_id + api.batch_bucket_files(bucket_write, add=[(b"content", "source.txt")]) + + api.copy_files( + f"hf://buckets/{bucket_write}/source.txt", + f"hf://buckets/{destination_bucket}/folder/", + ) + + destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} + assert "folder/source.txt" in destination_files + + @pytest.mark.parametrize( "source, destination, expected_content_type", [ From da4efab7814cddca5261b8f2ea7b5405b8c97630 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 14:55:43 +0200 Subject: [PATCH 10/21] simpler --- docs/source/en/guides/buckets.md | 1 - docs/source/en/guides/cli.md | 1 - src/huggingface_hub/hf_api.py | 21 +++------------------ tests/test_buckets.py | 14 -------------- 4 files changed, 3 insertions(+), 34 deletions(-) diff --git a/docs/source/en/guides/buckets.md 
b/docs/source/en/guides/buckets.md index fd980e6f2d..41a9ae8075 100644 --- a/docs/source/en/guides/buckets.md +++ b/docs/source/en/guides/buckets.md @@ -517,7 +517,6 @@ The same is available from the CLI: Notes: -- When copying a folder, the destination is checked remotely to determine whether it is an existing file. If it is, an error is raised. Otherwise, it is treated as a directory. - Bucket-to-repo copy is not yet supported. - Files tracked with Xet (in buckets or repos) are copied server-side by hash — no data is downloaded or re-uploaded. - Small text files not tracked with Xet on repo sources are downloaded and re-uploaded to the destination bucket. diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 2bd94a87c8..6fc428c654 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -716,7 +716,6 @@ To copy from a repo or a bucket on the Hub: Notes: -- When copying a folder, the destination is checked remotely to determine whether it is an existing file. If it is, an error is raised. Otherwise, it is treated as a directory. - Bucket-to-repo copy is not supported. ### Sync directories diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 7bc8b0ee74..6795d1c57d 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -12521,8 +12521,7 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None Raises: [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): - If the destination is not a bucket, if the source/destination handles are invalid, or if a folder - copy targets an existing file in the destination bucket. + If the destination is not a bucket or if the source/destination handles are invalid. 
Example: ```python @@ -12628,14 +12627,7 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: ) else: # Source path is a folder (or prefix) — list and copy all matching files - if source_path != "": - # Always verify the destination is not an existing file in the bucket. - dest_info = list( - self.get_bucket_paths_info(destination_bucket_id, [destination_path], token=token) - ) - if dest_info: - raise ValueError(f"Cannot copy a folder to a file destination '{destination_path}'.") - destination_is_directory = True + destination_is_directory = True for item in self.list_bucket_tree( source_handle.bucket_id, prefix=source_path or None, recursive=True, token=token ): @@ -12667,14 +12659,7 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: _add_repo_file(source_path_info[0], target_path) else: # Source path is a folder — list and copy all files recursively - if source_path: - # Always verify the destination is not an existing file in the bucket. - dest_info = list( - self.get_bucket_paths_info(destination_bucket_id, [destination_path], token=token) - ) - if dest_info: - raise ValueError(f"Cannot copy a folder to a file destination '{destination_path}'.") - destination_is_directory = True + destination_is_directory = True for item in self.list_repo_tree( repo_id=source_handle.repo_id, path_in_repo=source_path, diff --git a/tests/test_buckets.py b/tests/test_buckets.py index ed13432556..ed9d846485 100644 --- a/tests/test_buckets.py +++ b/tests/test_buckets.py @@ -396,20 +396,6 @@ def test_copy_files_folder_to_existing_folder_dest(api: HfApi, bucket_write: str assert destination_files >= {"target-folder/existing.txt", "target-folder/a.txt", "target-folder/sub/b.txt"} -@requires("hf_xet") -def test_copy_files_folder_to_existing_file_dest_raises(api: HfApi, bucket_write: str): - """source=folder, dest is an existing file => must raise.""" - destination_bucket = api.create_bucket(bucket_name()).bucket_id - api.batch_bucket_files(bucket_write, 
add=[(b"a", "folder/a.txt")]) - api.batch_bucket_files(destination_bucket, add=[(b"existing-file", "target-file")]) - - with pytest.raises(ValueError, match="Cannot copy a folder to a file destination"): - api.copy_files( - f"hf://buckets/{bucket_write}/folder", - f"hf://buckets/{destination_bucket}/target-file", - ) - - @requires("hf_xet") def test_copy_files_file_to_existing_file_dest(api: HfApi, bucket_write: str): """source=file, dest is an existing file => must work (overwrite).""" From c685a7a8499162e9e5f21bd700f1ebc02fb52eea Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 14:57:01 +0200 Subject: [PATCH 11/21] docs --- docs/source/en/guides/buckets.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/guides/buckets.md b/docs/source/en/guides/buckets.md index 41a9ae8075..af2d89689e 100644 --- a/docs/source/en/guides/buckets.md +++ b/docs/source/en/guides/buckets.md @@ -355,8 +355,8 @@ You can also copy xet files from another bucket or repository using the `copy` p >>> batch_bucket_files( ... "username/my-bucket", ... copy=[ -... ("bucket", "username/source-bucket", "", "models/model.safetensors"), -... ("model", "username/my-model", "", "models/config.safetensors"), +... ("bucket", "username/source-bucket", "", "models/model.safetensors"), +... ("model", "username/my-model", "", "models/config.safetensors"), ... ], ... 
) ``` From b58c13f4806367cf6371aeef07e6186a2844c0f8 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:01:58 +0200 Subject: [PATCH 12/21] type --- docs/source/en/guides/cli.md | 1 - src/huggingface_hub/hf_api.py | 14 +++----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 6fc428c654..1f21a5f65a 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -674,7 +674,6 @@ To filter by prefix, append the prefix to the bucket path: ### Copy single files Use `hf buckets cp` to copy individual files to and from a bucket, or to copy any file hosted on the Hub to a Bucket. -Bucket paths use the `hf://buckets/` prefix. To upload a file: diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 6795d1c57d..270586a874 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -29,15 +29,7 @@ from functools import wraps from itertools import islice from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - BinaryIO, - Literal, - Optional, - TypeVar, - overload, -) +from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, overload from urllib.parse import quote, unquote import httpcore @@ -444,7 +436,7 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle namespace, repo_name_with_revision = parts[0], parts[1] remaining_parts = parts[2:] - revision: Optional[str] = None + revision: str | None = None if "@" in repo_name_with_revision: repo_name, revision = repo_name_with_revision.split("@", 1) else: @@ -12554,7 +12546,7 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None all_adds: list[_BucketAddFile | tuple[str, str]] = [] all_copies: list[_BucketCopyFile] = [] - def _resolve_target_path(src_file_path: str, src_root_path: Optional[str], is_single_file: bool) -> str: + def _resolve_target_path(src_file_path: str, 
src_root_path: str | None, is_single_file: bool) -> str: basename = src_file_path.rsplit("/", 1)[-1] if is_single_file: if destination_path == "": From d1115fa8796ab271bcc124683c23e9a9cba75224 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:24:47 +0200 Subject: [PATCH 13/21] no dummy check --- src/huggingface_hub/hf_api.py | 39 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 270586a874..868bfc701b 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -95,6 +95,7 @@ XetRefreshTokenError, ) from .file_download import DryRunFileInfo, HfFileMetadata, get_hf_file_metadata, hf_hub_url +from .hf_file_system import SPECIAL_REFS_REVISION_REGEX from .repocard_data import DatasetCardData, ModelCardData, SpaceCardData from .utils import ( DEFAULT_IGNORE_PATTERNS, @@ -239,15 +240,6 @@ _BUCKET_BATCH_ADD_CHUNK_SIZE = 100 _BUCKET_BATCH_DELETE_CHUNK_SIZE = 1000 -_HF_COPY_REPO_TYPE_PREFIXES: dict[str, Literal["model", "dataset", "space"]] = { - "model": constants.REPO_TYPE_MODEL, - "models": constants.REPO_TYPE_MODEL, - "dataset": constants.REPO_TYPE_DATASET, - "datasets": constants.REPO_TYPE_DATASET, - "space": constants.REPO_TYPE_SPACE, - "spaces": constants.REPO_TYPE_SPACE, -} -_SPECIAL_REFS_REVISION_REGEX = re.compile(r"(^refs\/convert\/\w+)|(^refs\/pr\/\d+)") logger = logging.get_logger(__name__) @@ -415,7 +407,6 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle return _BucketCopyHandle( bucket_id=bucket_id, path=bucket_path.strip("/"), - is_directory=hf_handle.endswith("/") and bucket_path != "", ) path = path.strip("/") @@ -424,14 +415,13 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle parts = path.split("/") repo_type: Literal["model", "dataset", "space"] = constants.REPO_TYPE_MODEL - if parts[0] in _HF_COPY_REPO_TYPE_PREFIXES: - repo_type 
= _HF_COPY_REPO_TYPE_PREFIXES[parts[0]] + if parts[0] in constants.REPO_TYPES_MAPPING: + repo_type = constants.REPO_TYPES_MAPPING[parts[0]] parts = parts[1:] if len(parts) < 2: raise ValueError( - f"Invalid repo HF handle: '{hf_handle}'. Expected format 'hf:////path' " - "or with explicit repo type prefix." + f"Invalid repo HF handle: '{hf_handle}'. Expected format 'hf:////path' or with explicit repo type prefix." ) namespace, repo_name_with_revision = parts[0], parts[1] @@ -446,7 +436,7 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle revision = constants.DEFAULT_REVISION elif remaining_parts: maybe_special_ref = f"{unquote(revision)}/{remaining_parts[0]}" - match = _SPECIAL_REFS_REVISION_REGEX.match(maybe_special_ref) + match = SPECIAL_REFS_REVISION_REGEX.match(maybe_special_ref) if match is not None: special_ref = match.group() revision = special_ref @@ -463,7 +453,6 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle repo_id=f"{namespace}/{repo_name}", revision=revision, path=repo_path, - is_directory=hf_handle.endswith("/") and repo_path != "", ) @@ -733,7 +722,6 @@ def __repr__(self) -> str: class _BucketCopyHandle: bucket_id: str path: str - is_directory: bool @dataclass(frozen=True) @@ -742,7 +730,6 @@ class _RepoCopyHandle: repo_id: str revision: str path: str - is_directory: bool @dataclass @@ -12541,7 +12528,21 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None destination_bucket_id = destination_handle.bucket_id destination_path = destination_handle.path - destination_is_directory = destination_handle.is_directory or destination_path == "" + if destination_path == "": + destination_is_directory = True + else: + # Check if destination path is an existing file or a directory in the bucket + dest_path_info = list(self.get_bucket_paths_info(destination_bucket_id, [destination_path], token=token)) + if dest_path_info: + destination_is_directory = False + else: 
+ destination_is_directory = ( + next( + self.list_bucket_tree(destination_bucket_id, prefix=destination_path, recursive=False, token=token), + None, + ) + is not None + ) all_adds: list[_BucketAddFile | tuple[str, str]] = [] all_copies: list[_BucketCopyFile] = [] From 9f82d8170c2c4ad86432d55e9a77331c72fbb7b2 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:29:08 +0200 Subject: [PATCH 14/21] fix imports and types --- src/huggingface_hub/hf_api.py | 18 +++++++++++++++--- src/huggingface_hub/hf_file_system.py | 14 +------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 868bfc701b..1b9301bbfc 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -95,7 +95,6 @@ XetRefreshTokenError, ) from .file_download import DryRunFileInfo, HfFileMetadata, get_hf_file_metadata, hf_hub_url -from .hf_file_system import SPECIAL_REFS_REVISION_REGEX from .repocard_data import DatasetCardData, ModelCardData, SpaceCardData from .utils import ( DEFAULT_IGNORE_PATTERNS, @@ -240,6 +239,15 @@ _BUCKET_BATCH_ADD_CHUNK_SIZE = 100 _BUCKET_BATCH_DELETE_CHUNK_SIZE = 1000 +# Regex used to match special revisions with "/" in them (see #1710) +SPECIAL_REFS_REVISION_REGEX = re.compile( + r""" + (^refs\/convert\/\w+) # `refs/convert/parquet` revisions + | + (^refs\/pr\/\d+) # PR revisions + """, + re.VERBOSE, +) logger = logging.get_logger(__name__) @@ -416,7 +424,7 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle parts = path.split("/") repo_type: Literal["model", "dataset", "space"] = constants.REPO_TYPE_MODEL if parts[0] in constants.REPO_TYPES_MAPPING: - repo_type = constants.REPO_TYPES_MAPPING[parts[0]] + repo_type = constants.REPO_TYPES_MAPPING[parts[0]] # type: ignore parts = parts[1:] if len(parts) < 2: @@ -12538,7 +12546,11 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None else: 
destination_is_directory = ( next( - self.list_bucket_tree(destination_bucket_id, prefix=destination_path, recursive=False, token=token), + iter( + self.list_bucket_tree( + destination_bucket_id, prefix=destination_path, recursive=False, token=token + ) + ), None, ) is not None diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 3ae8493006..55aa490ff2 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -1,5 +1,4 @@ import os -import re import tempfile import threading from collections import deque @@ -28,22 +27,11 @@ RevisionNotFoundError, ) from .file_download import hf_hub_url, http_get -from .hf_api import BucketFile, BucketFolder, HfApi, LastCommitInfo, RepoFile, RepoFolder +from .hf_api import SPECIAL_REFS_REVISION_REGEX, BucketFile, BucketFolder, HfApi, LastCommitInfo, RepoFile, RepoFolder from .utils import HFValidationError, hf_raise_for_status, http_backoff, http_stream_backoff from .utils.insecure_hashlib import md5 -# Regex used to match special revisions with "/" in them (see #1710) -SPECIAL_REFS_REVISION_REGEX = re.compile( - r""" - (^refs\/convert\/\w+) # `refs/convert/parquet` revisions - | - (^refs\/pr\/\d+) # PR revisions - """, - re.VERBOSE, -) - - @dataclass class HfFileSystemResolvedPath: """Top level Data structure containing information about a resolved Hugging Face file system path.""" From e6711cbbd657e86fb70e08cad4e85bb6cfacb2bc Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:30:22 +0200 Subject: [PATCH 15/21] add todo --- src/huggingface_hub/hf_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 1b9301bbfc..aee0678161 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -406,6 +406,7 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: str | None = None) -> tuple def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle 
| _RepoCopyHandle: + # TODO: Harmonize hf:// parsing. See https://github.com/huggingface/huggingface_hub/issues/3971 if not hf_handle.startswith("hf://"): raise ValueError(f"Invalid HF handle: '{hf_handle}'. Expected a path starting with 'hf://'.") From de374cfc0c71210649038a426818f6f208b0d286 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:48:16 +0200 Subject: [PATCH 16/21] simplified --- src/huggingface_hub/hf_api.py | 60 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index aee0678161..b61dd99484 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -12513,20 +12513,19 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None Example: ```python - >>> from huggingface_hub import HfApi - >>> api = HfApi() + >>> from huggingface_hub import copy_files # Copy a single file between buckets - >>> api.copy_files("hf://buckets/my-bucket/data.bin", "hf://buckets/other-bucket/data.bin") + >>> copy_files("hf://buckets/my-bucket/data.bin", "hf://buckets/other-bucket/data.bin") # Copy a folder from a bucket to another bucket - >>> api.copy_files("hf://buckets/my-bucket/models/", "hf://buckets/other-bucket/backup/") + >>> copy_files("hf://buckets/my-bucket/models/", "hf://buckets/other-bucket/backup/") # Copy a file from a model repo to a bucket - >>> api.copy_files("hf://username/my-model/model.safetensors", "hf://buckets/my-bucket/") + >>> copy_files("hf://username/my-model/model.safetensors", "hf://buckets/my-bucket/") # Copy an entire dataset to a bucket - >>> api.copy_files("hf://datasets/username/my-dataset/", "hf://buckets/my-bucket/datasets/") + >>> copy_files("hf://datasets/username/my-dataset/", "hf://buckets/my-bucket/datasets/") ``` """ source_handle = _parse_hf_copy_handle(source) @@ -12615,6 +12614,7 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: ) ) else: + # 
TODO: optimize this to download in parallel (low prio) all_adds.append((_download_from_repo(file.path), target_path)) # === Source is a bucket: always hash-based copy (no download needed) === @@ -12742,8 +12742,8 @@ def batch_bucket_files( >>> batch_bucket_files( ... "username/my-bucket", ... copy=[ - ... ("bucket", "username/source-bucket", "xet-hash-abc123", "models/model.safetensors"), - ... ("model", "username/my-model", "xet-hash-def456", "models/config.safetensors"), + ... ("bucket", "username/source-bucket", "", "models/model.safetensors"), + ... ("model", "username/my-model", "", "models/config.safetensors"), ... ], ... ) @@ -12758,29 +12758,15 @@ def batch_bucket_files( ... ) ``` """ - add = [] if add is None else add - copy = [] if copy is None else copy - delete = [] if delete is None else delete - - # Convert public copy tuples to internal _BucketCopyFile objects - copy_operations = [ - _BucketCopyFile( - destination=destination, - xet_hash=xet_hash, - source_repo_type=source_repo_type, - source_repo_id=source_repo_id, - ) - for source_repo_type, source_repo_id, xet_hash, destination in copy - ] + add = add or [] + copy = copy or [] + delete = delete or [] # Small batch: do everything in one call - if len(add) + len(copy_operations) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: - add_payload: list[tuple[str | Path | bytes, str] | _BucketAddFile] | None = ( - list(add) if len(add) > 0 else None + if len(add) + len(copy) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: + self._batch_bucket_files( + bucket_id, add=add or None, copy=copy or None, delete=delete or None, token=token ) - copy_payload: list[_BucketCopyFile] | None = copy_operations if len(copy_operations) > 0 else None - delete_payload: list[str] | None = delete if len(delete) > 0 else None - self._batch_bucket_files(bucket_id, add=add_payload, copy=copy_payload, delete=delete_payload, token=token) return # Large batch: chunk copies first (no upload), then adds, then deletes @@ -12792,7 +12778,7 
@@ def batch_bucket_files( progress = None try: - for copy_chunk in chunk_iterable(copy_operations, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): + for copy_chunk in chunk_iterable(copy, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): self._batch_bucket_files(bucket_id, copy=list(copy_chunk), token=token) for add_chunk in chunk_iterable(add, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): @@ -12812,7 +12798,7 @@ def _batch_bucket_files( bucket_id: str, *, add: list[tuple[str | Path | bytes, str] | _BucketAddFile] | None = None, - copy: list[_BucketCopyFile] | None = None, + copy: list[tuple[str, str, str, str] | _BucketCopyFile] | None = None, delete: list[str] | None = None, token: str | bool | None = None, _progress: XetProgressReporter | None = None, @@ -12828,7 +12814,19 @@ def _batch_bucket_files( source, destination = item operations.append(_BucketAddFile(source=source, destination=destination)) if copy: - operations.extend(copy) + for item in copy: + if isinstance(item, _BucketCopyFile): + operations.append(item) + else: + source_repo_type, source_repo_id, xet_hash, destination = item + operations.append( + _BucketCopyFile( + destination=destination, + xet_hash=xet_hash, + source_repo_type=source_repo_type, + source_repo_id=source_repo_id, + ) + ) if delete: for path in delete: operations.append(_BucketDeleteFile(path=path)) From 5e76d93167a89501e001057e71c0ca853d545513 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:57:12 +0200 Subject: [PATCH 17/21] useless --- src/huggingface_hub/hf_api.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index b61dd99484..02f801f553 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -12764,9 +12764,7 @@ def batch_bucket_files( # Small batch: do everything in one call if len(add) + len(copy) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: - self._batch_bucket_files( - bucket_id, add=add or None, 
copy=copy or None, delete=delete or None, token=token - ) + self._batch_bucket_files(bucket_id, add=add or None, copy=copy or None, delete=delete or None, token=token) return # Large batch: chunk copies first (no upload), then adds, then deletes @@ -12782,8 +12780,7 @@ def batch_bucket_files( self._batch_bucket_files(bucket_id, copy=list(copy_chunk), token=token) for add_chunk in chunk_iterable(add, chunk_size=_BUCKET_BATCH_ADD_CHUNK_SIZE): - add_chunk_list: list[tuple[str | Path | bytes, str] | _BucketAddFile] = list(add_chunk) - self._batch_bucket_files(bucket_id, add=add_chunk_list, token=token, _progress=progress) + self._batch_bucket_files(bucket_id, add=list(add_chunk), token=token, _progress=progress) for delete_chunk in chunk_iterable(delete, chunk_size=_BUCKET_BATCH_DELETE_CHUNK_SIZE): self._batch_bucket_files(bucket_id, delete=list(delete_chunk), token=token) From d601517af9be2d26c6086ae7aeab17ac685a34f2 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 15:58:39 +0200 Subject: [PATCH 18/21] type --- src/huggingface_hub/hf_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 02f801f553..313ace18ee 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -12764,7 +12764,7 @@ def batch_bucket_files( # Small batch: do everything in one call if len(add) + len(copy) + len(delete) <= _BUCKET_BATCH_ADD_CHUNK_SIZE: - self._batch_bucket_files(bucket_id, add=add or None, copy=copy or None, delete=delete or None, token=token) + self._batch_bucket_files(bucket_id, add=add or None, copy=copy or None, delete=delete or None, token=token) # type: ignore return # Large batch: chunk copies first (no upload), then adds, then deletes From 47345a0dce5ab2f3cb9acdeab038659206d8534b Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 16:04:53 +0200 Subject: [PATCH 19/21] all good --- src/huggingface_hub/hf_api.py | 7 +++++-- 1 file 
changed, 5 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 313ace18ee..ef2a2ae2c8 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -12796,7 +12796,7 @@ def _batch_bucket_files( *, add: list[tuple[str | Path | bytes, str] | _BucketAddFile] | None = None, copy: list[tuple[str, str, str, str] | _BucketCopyFile] | None = None, - delete: list[str] | None = None, + delete: list[str | _BucketDeleteFile] | None = None, token: str | bool | None = None, _progress: XetProgressReporter | None = None, ): @@ -12826,7 +12826,10 @@ def _batch_bucket_files( ) if delete: for path in delete: - operations.append(_BucketDeleteFile(path=path)) + if isinstance(path, _BucketDeleteFile): + operations.append(path) + else: + operations.append(_BucketDeleteFile(path=path)) if not operations: return From b46f914960d3f15b8238dfe6c3573da3cc587ded Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 16:19:41 +0200 Subject: [PATCH 20/21] review tests --- tests/test_buckets.py | 125 ++++++++++++++++++++------------------ tests/test_buckets_cli.py | 32 +++------- 2 files changed, 76 insertions(+), 81 deletions(-) diff --git a/tests/test_buckets.py b/tests/test_buckets.py index ed9d846485..fb79c2b8e2 100644 --- a/tests/test_buckets.py +++ b/tests/test_buckets.py @@ -95,6 +95,13 @@ def bucket_write(api: HfApi) -> str: return bucket.bucket_id +@pytest.fixture(scope="function") +def bucket_write_2(api: HfApi) -> str: + """Second bucket for read-write tests (rebuilt every test).""" + bucket = api.create_bucket(bucket_name()) + return bucket.bucket_id + + def test_create_bucket(api: HfApi): bucket_id = f"{USER}/{bucket_name()}" bucket_url = api.create_bucket(bucket_id) @@ -303,7 +310,7 @@ def test_download_bucket_files_raises_on_missing_when_requested(api: HfApi, buck @requires("hf_xet") -def test_copy_files_bucket_to_same_bucket_file(api: HfApi, bucket_write: str): +def 
test_copy_files_bucket_to_same_bucket_file(api: HfApi, bucket_write: str, tmp_path): api.batch_bucket_files(bucket_write, add=[(b"bucket-content", "source.txt")]) api.copy_files( @@ -311,135 +318,135 @@ def test_copy_files_bucket_to_same_bucket_file(api: HfApi, bucket_write: str): f"hf://buckets/{bucket_write}/copied.txt", ) - files = {entry.path for entry in api.list_bucket_tree(bucket_write)} - assert files >= {"source.txt", "copied.txt"} + output_path = tmp_path / "copied.txt" + api.download_bucket_files(bucket_write, [("copied.txt", str(output_path))]) + assert output_path.read_bytes() == b"bucket-content" @requires("hf_xet") -def test_copy_files_bucket_to_different_bucket_folder(api: HfApi, bucket_write: str): - source_bucket = bucket_write - destination_bucket = api.create_bucket(bucket_name()).bucket_id - api.batch_bucket_files(source_bucket, add=[(b"a", "logs/a.txt"), (b"b", "logs/sub/b.txt"), (b"c", "other/c.txt")]) +def test_copy_files_bucket_to_different_bucket_folder(api: HfApi, bucket_write: str, bucket_write_2: str, tmp_path): + api.batch_bucket_files(bucket_write, add=[(b"a", "logs/a.txt"), (b"b", "logs/sub/b.txt"), (b"c", "other/c.txt")]) api.copy_files( - f"hf://buckets/{source_bucket}/logs", - f"hf://buckets/{destination_bucket}/backup/", + f"hf://buckets/{bucket_write}/logs", + f"hf://buckets/{bucket_write_2}/backup/", ) - destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} - assert destination_files >= {"backup/a.txt", "backup/sub/b.txt"} + destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)} + assert "backup/a.txt" in destination_files + assert "backup/sub/b.txt" in destination_files assert "backup/c.txt" not in destination_files + # Check exact content + a_path = tmp_path / "a.txt" + b_path = tmp_path / "b.txt" + api.download_bucket_files(bucket_write_2, [("backup/a.txt", str(a_path)), ("backup/sub/b.txt", str(b_path))]) + assert a_path.read_bytes() == b"a" + assert 
b_path.read_bytes() == b"b" + @requires("hf_xet") def test_copy_files_repo_to_bucket_with_revision(api: HfApi, bucket_write: str, tmp_path): repo_id = api.create_repo(repo_id=repo_name(prefix="copy-files")).repo_id branch = "copy-files-branch" - try: - api.upload_file(repo_id=repo_id, path_in_repo="main.txt", path_or_fileobj=b"main") - api.create_branch(repo_id=repo_id, branch=branch) - api.upload_file( - repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch - ) + api.upload_file(repo_id=repo_id, path_in_repo="main.txt", path_or_fileobj=b"main") + api.create_branch(repo_id=repo_id, branch=branch) + api.upload_file(repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch) - api.copy_files( - f"hf://{repo_id}@{branch}/nested/from-branch.txt", - f"hf://buckets/{bucket_write}/from-repo.txt", - ) + api.copy_files( + f"hf://{repo_id}@{branch}/nested/from-branch.txt", + f"hf://buckets/{bucket_write}/from-repo.txt", + ) - output_path = tmp_path / "from-repo.txt" - api.download_bucket_files(bucket_write, [("from-repo.txt", output_path)]) - assert output_path.read_bytes() == b"branch" - finally: - api.delete_repo(repo_id=repo_id) + output_path = tmp_path / "from-repo.txt" + api.download_bucket_files(bucket_write, [("from-repo.txt", str(output_path))]) + assert output_path.read_bytes() == b"branch" @requires("hf_xet") def test_copy_files_bucket_to_repo_raises(api: HfApi, bucket_write: str): repo_id = api.create_repo(repo_id=repo_name(prefix="copy-files-dst")).repo_id - try: - api.batch_bucket_files(bucket_write, add=[(b"x", "x.txt")]) - with pytest.raises(ValueError, match="Destination must be a bucket"): - api.copy_files(f"hf://buckets/{bucket_write}/x.txt", f"hf://{repo_id}/x.txt") - finally: - api.delete_repo(repo_id=repo_id) + api.batch_bucket_files(bucket_write, add=[(b"x", "x.txt")]) + with pytest.raises(ValueError, match="Destination must be a bucket"): + 
api.copy_files(f"hf://buckets/{bucket_write}/x.txt", f"hf://{repo_id}/x.txt") @requires("hf_xet") -def test_copy_files_folder_to_nonexistent_dest(api: HfApi, bucket_write: str): +def test_copy_files_folder_to_nonexistent_dest(api: HfApi, bucket_write: str, bucket_write_2: str): """source=folder, dest doesn't exist => files copied under dest path.""" - destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(bucket_write, add=[(b"a", "folder/a.txt"), (b"b", "folder/sub/b.txt")]) api.copy_files( f"hf://buckets/{bucket_write}/folder", - f"hf://buckets/{destination_bucket}/target-folder", + f"hf://buckets/{bucket_write_2}/target-folder", ) - destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} - assert destination_files >= {"target-folder/a.txt", "target-folder/sub/b.txt"} + destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)} + assert "target-folder/a.txt" in destination_files + assert "target-folder/sub/b.txt" in destination_files @requires("hf_xet") -def test_copy_files_folder_to_existing_folder_dest(api: HfApi, bucket_write: str): +def test_copy_files_folder_to_existing_folder_dest(api: HfApi, bucket_write: str, bucket_write_2: str): """source=folder, dest is an existing folder => files merged under dest path.""" - destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(bucket_write, add=[(b"a", "folder/a.txt"), (b"b", "folder/sub/b.txt")]) - api.batch_bucket_files(destination_bucket, add=[(b"existing", "target-folder/existing.txt")]) + api.batch_bucket_files(bucket_write_2, add=[(b"existing", "target-folder/existing.txt")]) api.copy_files( f"hf://buckets/{bucket_write}/folder", - f"hf://buckets/{destination_bucket}/target-folder", + f"hf://buckets/{bucket_write_2}/target-folder", ) - destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} - assert destination_files >= {"target-folder/existing.txt", 
"target-folder/a.txt", "target-folder/sub/b.txt"} + destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)} + assert "target-folder/existing.txt" in destination_files + assert "target-folder/a.txt" in destination_files + assert "target-folder/sub/b.txt" in destination_files @requires("hf_xet") -def test_copy_files_file_to_existing_file_dest(api: HfApi, bucket_write: str): +def test_copy_files_file_to_existing_file_dest(api: HfApi, bucket_write: str, bucket_write_2: str, tmp_path): """source=file, dest is an existing file => must work (overwrite).""" - destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(bucket_write, add=[(b"new-content", "source.txt")]) - api.batch_bucket_files(destination_bucket, add=[(b"old-content", "dest.txt")]) + api.batch_bucket_files(bucket_write_2, add=[(b"old-content", "dest.txt")]) api.copy_files( f"hf://buckets/{bucket_write}/source.txt", - f"hf://buckets/{destination_bucket}/dest.txt", + f"hf://buckets/{bucket_write_2}/dest.txt", ) - destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} - assert "dest.txt" in destination_files + output_path = tmp_path / "dest.txt" + api.download_bucket_files(bucket_write_2, [("dest.txt", str(output_path))]) + assert output_path.read_bytes() == b"new-content" @requires("hf_xet") -def test_copy_files_file_to_nonexistent_dest(api: HfApi, bucket_write: str): +def test_copy_files_file_to_nonexistent_dest(api: HfApi, bucket_write: str, bucket_write_2: str, tmp_path): """source=file, dest doesn't exist => must work (creates file).""" - destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(bucket_write, add=[(b"content", "source.txt")]) api.copy_files( f"hf://buckets/{bucket_write}/source.txt", - f"hf://buckets/{destination_bucket}/new-file.txt", + f"hf://buckets/{bucket_write_2}/new-file.txt", ) - destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} - 
assert "new-file.txt" in destination_files + output_path = tmp_path / "new-file.txt" + api.download_bucket_files(bucket_write_2, [("new-file.txt", str(output_path))]) + assert output_path.read_bytes() == b"content" @requires("hf_xet") -def test_copy_files_file_to_folder_dest(api: HfApi, bucket_write: str): +def test_copy_files_file_to_folder_dest(api: HfApi, bucket_write: str, bucket_write_2: str, tmp_path): """source=file, dest is a folder (trailing '/') => file added to folder.""" - destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(bucket_write, add=[(b"content", "source.txt")]) api.copy_files( f"hf://buckets/{bucket_write}/source.txt", - f"hf://buckets/{destination_bucket}/folder/", + f"hf://buckets/{bucket_write_2}/folder/", ) - destination_files = {entry.path for entry in api.list_bucket_tree(destination_bucket)} - assert "folder/source.txt" in destination_files + output_path = tmp_path / "source.txt" + api.download_bucket_files(bucket_write_2, [("folder/source.txt", str(output_path))]) + assert output_path.read_bytes() == b"content" @pytest.mark.parametrize( diff --git a/tests/test_buckets_cli.py b/tests/test_buckets_cli.py index 9c0fe703db..da95411171 100644 --- a/tests/test_buckets_cli.py +++ b/tests/test_buckets_cli.py @@ -881,14 +881,11 @@ def test_cp_remote_bucket_to_bucket(api: HfApi): destination_bucket = api.create_bucket(bucket_name()).bucket_id api.batch_bucket_files(source_bucket, add=[(b"aaa", "logs/a.txt"), (b"bbb", "logs/sub/b.txt"), (b"ccc", "c.txt")]) - result = cli( - f"hf buckets cp hf://buckets/{source_bucket}/logs hf://buckets/{destination_bucket}/backup/", - ) + cli(f"hf buckets cp hf://buckets/{source_bucket}/logs hf://buckets/{destination_bucket}/backup/") - assert result.exit_code == 0, result.output - assert f"Copied: hf://buckets/{source_bucket}/logs -> hf://buckets/{destination_bucket}/backup/" in result.output files = _remote_files(api, destination_bucket) - assert files >= {"backup/a.txt", 
"backup/sub/b.txt"} + assert "backup/a.txt" in files + assert "backup/sub/b.txt" in files assert "backup/c.txt" not in files @@ -897,22 +894,13 @@ def test_cp_remote_repo_to_bucket(api: HfApi): branch = "cp-copy-branch" destination_bucket = api.create_bucket(bucket_name()).bucket_id - try: - api.upload_file(repo_id=repo_id, path_in_repo="main.txt", path_or_fileobj=b"main") - api.create_branch(repo_id=repo_id, branch=branch) - api.upload_file( - repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch - ) - - result = cli( - f"hf buckets cp hf://{repo_id}@{branch}/nested/from-branch.txt hf://buckets/{destination_bucket}/copied.txt" - ) - assert result.exit_code == 0, result.output - assert ( - f"Copied: hf://{repo_id}@{branch}/nested/from-branch.txt -> hf://buckets/{destination_bucket}/copied.txt" - ) in result.output - finally: - api.delete_repo(repo_id=repo_id) + api.upload_file(repo_id=repo_id, path_in_repo="main.txt", path_or_fileobj=b"main") + api.create_branch(repo_id=repo_id, branch=branch) + api.upload_file(repo_id=repo_id, path_in_repo="nested/from-branch.txt", path_or_fileobj=b"branch", revision=branch) + + cli(f"hf buckets cp hf://{repo_id}@{branch}/nested/from-branch.txt hf://buckets/{destination_bucket}/copied.txt") + + assert "copied.txt" in _remote_files(api, destination_bucket) def test_cp_error_bucket_to_repo(api: HfApi, bucket_write: str): From a996a0ffc6ae0235f7479cecbe4badecd639d1d3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 7 Apr 2026 16:54:36 +0200 Subject: [PATCH 21/21] mypy happy --- src/huggingface_hub/hf_api.py | 60 +++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index ef2a2ae2c8..84137037d0 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -423,9 +423,9 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle raise 
ValueError(f"Invalid HF handle: '{hf_handle}'.") parts = path.split("/") - repo_type: Literal["model", "dataset", "space"] = constants.REPO_TYPE_MODEL + repo_type: str = constants.REPO_TYPE_MODEL if parts[0] in constants.REPO_TYPES_MAPPING: - repo_type = constants.REPO_TYPES_MAPPING[parts[0]] # type: ignore + repo_type = constants.REPO_TYPES_MAPPING[parts[0]] parts = parts[1:] if len(parts) < 2: @@ -458,7 +458,7 @@ def _parse_hf_copy_handle(hf_handle: str) -> _BucketCopyHandle | _RepoCopyHandle repo_path = "/".join(remaining_parts).strip("/") return _RepoCopyHandle( - repo_type=repo_type, + repo_type=repo_type, # type: ignore repo_id=f"{namespace}/{repo_name}", revision=revision, path=repo_path, @@ -12598,10 +12598,10 @@ def _copy_by_hash( def _download_from_repo(file_path: str) -> str: """Download a repo file to local cache, return the cache path.""" return self.hf_hub_download( - repo_id=source_handle.repo_id, - repo_type=source_handle.repo_type, + repo_id=source_handle.repo_id, # type: ignore + repo_type=source_handle.repo_type, # type: ignore filename=file_path, - revision=source_handle.revision, + revision=source_handle.revision, # type: ignore token=token, ) @@ -12610,7 +12610,11 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: if file.xet_hash is not None: all_copies.append( _copy_by_hash( - target_path, file.xet_hash, file.size, source_handle.repo_type, source_handle.repo_id + target_path, + file.xet_hash, + file.size, + source_handle.repo_type, # type: ignore + source_handle.repo_id, # type: ignore ) ) else: @@ -12649,9 +12653,9 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: # === Source is a repo: copy-by-hash if xet-backed, download otherwise === else: source_path = source_handle.path - source_path_info: list[RepoFile | RepoFolder] = [] + source_repo_path_info: list[RepoFile | RepoFolder] = [] if source_path != "": - source_path_info = self.get_paths_info( + source_repo_path_info = self.get_paths_info( 
repo_id=source_handle.repo_id, paths=[source_path], repo_type=source_handle.repo_type, @@ -12659,14 +12663,14 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: token=token, ) - if len(source_path_info) == 1 and isinstance(source_path_info[0], RepoFile): + if len(source_repo_path_info) == 1 and isinstance(source_repo_path_info[0], RepoFile): # Source path matched a single file - target_path = _resolve_target_path(source_path_info[0].path, None, is_single_file=True) - _add_repo_file(source_path_info[0], target_path) + target_path = _resolve_target_path(source_repo_path_info[0].path, None, is_single_file=True) + _add_repo_file(source_repo_path_info[0], target_path) else: # Source path is a folder — list and copy all files recursively destination_is_directory = True - for item in self.list_repo_tree( + for repo_item in self.list_repo_tree( repo_id=source_handle.repo_id, path_in_repo=source_path, recursive=True, @@ -12674,10 +12678,10 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None: revision=source_handle.revision, token=token, ): - if not isinstance(item, RepoFile): + if not isinstance(repo_item, RepoFile): continue - target_path = _resolve_target_path(item.path, source_path or None, is_single_file=False) - _add_repo_file(item, target_path) + target_path = _resolve_target_path(repo_item.path, source_path or None, is_single_file=False) + _add_repo_file(repo_item, target_path) # Send copies first (no upload needed), then adds (may need upload) if all_copies: @@ -12804,18 +12808,18 @@ def _batch_bucket_files( # Convert public API inputs to internal operation objects operations: list[_BucketAddFile | _BucketCopyFile | _BucketDeleteFile] = [] if add: - for item in add: - if isinstance(item, _BucketAddFile): - operations.append(item) + for add_item in add: + if isinstance(add_item, _BucketAddFile): + operations.append(add_item) else: - source, destination = item + source, destination = add_item operations.append(_BucketAddFile(source=source, 
destination=destination)) if copy: - for item in copy: - if isinstance(item, _BucketCopyFile): - operations.append(item) + for copy_item in copy: + if isinstance(copy_item, _BucketCopyFile): + operations.append(copy_item) else: - source_repo_type, source_repo_id, xet_hash, destination = item + source_repo_type, source_repo_id, xet_hash, destination = copy_item operations.append( _BucketCopyFile( destination=destination, @@ -12825,11 +12829,11 @@ def _batch_bucket_files( ) ) if delete: - for path in delete: - if isinstance(path, _BucketDeleteFile): - operations.append(path) + for delete_item in delete: + if isinstance(delete_item, _BucketDeleteFile): + operations.append(delete_item) else: - operations.append(_BucketDeleteFile(path=path)) + operations.append(_BucketDeleteFile(path=delete_item)) if not operations: return