-
Notifications
You must be signed in to change notification settings - Fork 982
Add experimental bucket+mount transport for Jobs script upload #4025
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
davanstrien
wants to merge
4
commits into
main
Choose a base branch
from
feat/bucket-transport-jobs
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+373
−46
Open
Changes from 3 commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
be789f6
Add experimental bucket+mount transport for Jobs script upload
davanstrien f0ea7e1
Merge branch 'main' into feat/bucket-transport-jobs
davanstrien 40472d8
Update src/huggingface_hub/hf_api.py
davanstrien 93e0070
Fix tests and docstring to use `scripts/` prefix (drop underscore)
davanstrien File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -130,6 +130,7 @@ | |
| from .utils._auth import _get_token_from_environment, _get_token_from_file, _get_token_from_google_colab | ||
| from .utils._deprecation import _deprecate_arguments, _deprecate_method | ||
| from .utils._http import _httpx_follow_relative_redirects_with_backoff | ||
| from .utils._runtime import is_xet_available | ||
| from .utils._typing import CallableT | ||
| from .utils._verification import collect_local_files, resolve_local_root, verify_maps | ||
| from .utils.endpoint_helpers import _is_emission_within_threshold | ||
|
|
@@ -11467,7 +11468,7 @@ def run_uv_job( | |
| secrets = secrets or {} | ||
|
|
||
| # Build command | ||
| command, env, secrets = self._create_uv_command_env_and_secrets( | ||
| command, env, secrets, extra_volumes = self._create_uv_command_env_and_secrets( | ||
| script=script, | ||
| script_args=script_args, | ||
| dependencies=dependencies, | ||
|
|
@@ -11476,7 +11477,10 @@ def run_uv_job( | |
| secrets=secrets, | ||
| namespace=namespace, | ||
| token=token, | ||
| volumes=volumes, | ||
| ) | ||
| if extra_volumes: | ||
| volumes = (volumes or []) + extra_volumes | ||
| # Create RunCommand args | ||
| return self.run_job( | ||
| image=image, | ||
|
|
@@ -11886,7 +11890,7 @@ def create_scheduled_uv_job( | |
| """ | ||
| image = image or "ghcr.io/astral-sh/uv:python3.12-bookworm" | ||
| # Build command | ||
| command, env, secrets = self._create_uv_command_env_and_secrets( | ||
| command, env, secrets, extra_volumes = self._create_uv_command_env_and_secrets( | ||
| script=script, | ||
| script_args=script_args, | ||
| dependencies=dependencies, | ||
|
|
@@ -11895,7 +11899,10 @@ def create_scheduled_uv_job( | |
| secrets=secrets, | ||
| namespace=namespace, | ||
| token=token, | ||
| volumes=volumes, | ||
| ) | ||
| if extra_volumes: | ||
| volumes = (volumes or []) + extra_volumes | ||
| # Create RunCommand args | ||
| return self.create_scheduled_job( | ||
| image=image, | ||
|
|
@@ -11913,6 +11920,10 @@ def create_scheduled_uv_job( | |
| token=token, | ||
| ) | ||
|
|
||
| # Bucket transport constants for Jobs | ||
| _HF_JOBS_ARTIFACTS_MOUNT_PATH = "/artifacts" | ||
| _HF_JOBS_ARTIFACTS_BUCKET_NAME = "jobs-artifacts" | ||
|
Comment on lines
+11923
to
+11925
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. good names ! |
||
|
|
||
| def _create_uv_command_env_and_secrets( | ||
| self, | ||
| *, | ||
|
|
@@ -11924,7 +11935,8 @@ def _create_uv_command_env_and_secrets( | |
| secrets: dict[str, Any] | None, | ||
| namespace: str | None, | ||
| token: bool | str | None, | ||
| ) -> tuple[list[str], dict[str, Any], dict[str, Any]]: | ||
| volumes: list[Volume] | None = None, | ||
| ) -> tuple[list[str], dict[str, Any], dict[str, Any], list[Volume]]: | ||
| env = env or {} | ||
| secrets = secrets or {} | ||
|
|
||
|
|
@@ -11957,50 +11969,125 @@ def _create_uv_command_env_and_secrets( | |
| if len(local_files_to_include) == 0: | ||
| # Direct URL execution or command - no upload needed | ||
| command = ["uv", "run"] + uv_args + [script] + script_args | ||
| else: | ||
| # Find appropriate remote file names | ||
| remote_to_local_file_names: dict[str, str] = {} | ||
| for local_file_to_include in local_files_to_include: | ||
| local_file_path = Path(local_file_to_include) | ||
| # remove spaces for proper xargs parsing | ||
| remote_file_path = Path(local_file_path.name.replace(" ", "_")) | ||
| if remote_file_path.name in remote_to_local_file_names: | ||
| for i in itertools.count(): | ||
| remote_file_name = remote_file_path.with_stem(remote_file_path.stem + f"({i})").name | ||
| if remote_file_name not in remote_to_local_file_names: | ||
| remote_to_local_file_names[remote_file_name] = local_file_to_include | ||
| break | ||
| else: | ||
| remote_to_local_file_names[remote_file_path.name] = local_file_to_include | ||
| local_to_remote_file_names = { | ||
| local_file_to_include: remote_file_name | ||
| for remote_file_name, local_file_to_include in remote_to_local_file_names.items() | ||
| } | ||
| return command, env, secrets, [] | ||
|
|
||
| # Find appropriate remote file names | ||
| remote_to_local_file_names: dict[str, str] = {} | ||
| for local_file_to_include in local_files_to_include: | ||
| local_file_path = Path(local_file_to_include) | ||
| # remove spaces for proper xargs parsing | ||
| remote_file_path = Path(local_file_path.name.replace(" ", "_")) | ||
| if remote_file_path.name in remote_to_local_file_names: | ||
| for i in itertools.count(): | ||
| remote_file_name = remote_file_path.with_stem(remote_file_path.stem + f"({i})").name | ||
| if remote_file_name not in remote_to_local_file_names: | ||
| remote_to_local_file_names[remote_file_name] = local_file_to_include | ||
| break | ||
| else: | ||
| remote_to_local_file_names[remote_file_path.name] = local_file_to_include | ||
| local_to_remote_file_names = { | ||
| local_file_to_include: remote_file_name | ||
| for remote_file_name, local_file_to_include in remote_to_local_file_names.items() | ||
| } | ||
|
|
||
| # Replace local paths with remote paths in command | ||
| if script in local_to_remote_file_names: | ||
| script = local_to_remote_file_names[script] | ||
| script_args = [ | ||
| local_to_remote_file_names[arg] if arg in local_to_remote_file_names else arg for arg in script_args | ||
| ] | ||
| # Try bucket transport if opted in | ||
| use_bucket = constants.HF_JOBS_USE_BUCKET_TRANSPORT | ||
| if use_bucket: | ||
| # Check if /artifacts mount path is already taken by user volumes | ||
| existing_mount_paths = {v.mount_path for v in (volumes or [])} | ||
| if self._HF_JOBS_ARTIFACTS_MOUNT_PATH in existing_mount_paths: | ||
| logger.info( | ||
| f"Mount path {self._HF_JOBS_ARTIFACTS_MOUNT_PATH} already in use, falling back to base64 transport." | ||
| ) | ||
| use_bucket = False | ||
| elif not is_xet_available(): | ||
| logger.info("hf_xet not available, falling back to base64 transport for Jobs.") | ||
| use_bucket = False | ||
|
|
||
| # Load content to pass as environment variable with format | ||
| # file1 base64content1 | ||
| # file2 base64content2 | ||
| # ... | ||
| env["LOCAL_FILES_ENCODED"] = "\n".join( | ||
| remote_file_name + " " + base64.b64encode(Path(local_file_to_include).read_bytes()).decode() | ||
| for remote_file_name, local_file_to_include in remote_to_local_file_names.items() | ||
| ) | ||
| # Shell-quote each arg to prevent metacharacters (e.g. '>') from being interpreted by bash | ||
| quoted_parts = ["'" + arg.replace("'", r"'\''") + "'" for arg in [*uv_args, script, *script_args]] | ||
| command = [ | ||
| "bash", | ||
| "-c", | ||
| """echo $LOCAL_FILES_ENCODED | xargs -n 2 bash -c 'echo "$1" | base64 -d > "$0"' && """ | ||
| + f"uv run {' '.join(quoted_parts)}", | ||
| ] | ||
| return command, env, secrets | ||
| if use_bucket: | ||
| try: | ||
| extra_volumes, scripts_prefix = self._upload_scripts_to_bucket( | ||
| namespace=namespace, | ||
| remote_to_local_file_names=remote_to_local_file_names, | ||
| token=token, | ||
| ) | ||
| # Rewrite script and script_args to reference the mounted path | ||
| mount_path = self._HF_JOBS_ARTIFACTS_MOUNT_PATH | ||
| if script in local_to_remote_file_names: | ||
| script = f"{mount_path}/{scripts_prefix}/{local_to_remote_file_names[script]}" | ||
| script_args = [ | ||
| f"{mount_path}/{scripts_prefix}/{local_to_remote_file_names[arg]}" | ||
| if arg in local_to_remote_file_names | ||
| else arg | ||
| for arg in script_args | ||
| ] | ||
| command = ["uv", "run"] + uv_args + [script] + script_args | ||
| return command, env, secrets, extra_volumes | ||
| except Exception: | ||
| logger.warning( | ||
| "Failed to upload scripts to bucket, falling back to base64 transport.", | ||
| exc_info=True, | ||
| ) | ||
|
|
||
| # Base64 transport path (default) | ||
| # Replace local paths with remote paths in command | ||
| if script in local_to_remote_file_names: | ||
| script = local_to_remote_file_names[script] | ||
| script_args = [ | ||
| local_to_remote_file_names[arg] if arg in local_to_remote_file_names else arg for arg in script_args | ||
| ] | ||
|
|
||
| # Load content to pass as environment variable with format | ||
| # file1 base64content1 | ||
| # file2 base64content2 | ||
| # ... | ||
| env["LOCAL_FILES_ENCODED"] = "\n".join( | ||
| remote_file_name + " " + base64.b64encode(Path(local_file_to_include).read_bytes()).decode() | ||
| for remote_file_name, local_file_to_include in remote_to_local_file_names.items() | ||
| ) | ||
| # Shell-quote each arg to prevent metacharacters (e.g. '>') from being interpreted by bash | ||
| quoted_parts = ["'" + arg.replace("'", r"'\''") + "'" for arg in [*uv_args, script, *script_args]] | ||
| command = [ | ||
| "bash", | ||
| "-c", | ||
| """echo $LOCAL_FILES_ENCODED | xargs -n 2 bash -c 'echo "$1" | base64 -d > "$0"' && """ | ||
| + f"uv run {' '.join(quoted_parts)}", | ||
| ] | ||
| return command, env, secrets, [] | ||
|
|
||
| def _upload_scripts_to_bucket( | ||
| self, | ||
| *, | ||
| namespace: str, | ||
| remote_to_local_file_names: dict[str, str], | ||
| token: bool | str | None, | ||
| ) -> tuple[list[Volume], str]: | ||
| """Upload script files to a bucket and return volumes to mount plus the scripts prefix. | ||
|
|
||
| Creates a bucket ``{namespace}/jobs-artifacts`` (if it doesn't exist) and uploads | ||
| each script to ``scripts/{uuid}/{remote_name}`` inside it. Returns a :class:`Volume` | ||
| that mounts the bucket at ``/artifacts`` so the job can access the scripts directly. | ||
| """ | ||
| import uuid | ||
|
|
||
| bucket_id = f"{namespace}/{self._HF_JOBS_ARTIFACTS_BUCKET_NAME}" | ||
| subfolder_id = str(uuid.uuid4()) | ||
| scripts_prefix = f"scripts/{subfolder_id}" | ||
|
|
||
| self.create_bucket(bucket_id=bucket_id, exist_ok=True, token=token) | ||
|
|
||
| add_ops: list[tuple[str | Path | bytes, str]] = [ | ||
| (Path(local_path), f"{scripts_prefix}/{remote_name}") | ||
| for remote_name, local_path in remote_to_local_file_names.items() | ||
| ] | ||
| self.batch_bucket_files(bucket_id=bucket_id, add=add_ops, token=token) | ||
|
|
||
| volume = Volume( | ||
| type="bucket", | ||
| source=bucket_id, | ||
| mount_path=self._HF_JOBS_ARTIFACTS_MOUNT_PATH, | ||
| ) | ||
| return [volume], scripts_prefix | ||
|
|
||
| @validate_hf_hub_args | ||
| def create_bucket( | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
imo we can already set it to True by default
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
probably makes sense indeed! I'll wait to see what @Wauplin thinks