# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

# Fetches multi-arch build artifacts for a workflow run, packages them into
# per-family tarballs with build_tools/build_tarballs.py, and uploads the
# result with build_tools/github_actions/upload_tarballs.py.

name: Build Multi-Arch Tarballs

on:
  workflow_dispatch:
    inputs:
      artifact_run_id:
        description: "Run ID to fetch artifacts from (defaults to current run)"
        type: string
        default: ""
      artifact_github_repo:
        description: "GitHub repository for artifact_run_id"
        type: string
        default: ROCm/TheRock
      dist_amdgpu_families:
        description: "Semicolon-separated list of GPU families (e.g. 'gfx94X-dcgpu;gfx110X-all')"
        type: string
        # Required by build_tarballs.py; mirrors the workflow_call contract.
        required: true
      platform:
        type: choice
        description: "Platform to fetch artifacts for"
        options:
          - linux
          - windows
        default: "linux"
      package_version:
        description: "ROCm package version string (e.g. '7.13.0.dev0+abc123')"
        type: string
        # Required by build_tarballs.py; mirrors the workflow_call contract.
        required: true
      release_type:
        description: 'Release type: "" for CI, or "dev", "nightly", "prerelease".'
        type: string
        default: ""
  workflow_call:
    inputs:
      artifact_run_id:
        type: string
        default: ""
      artifact_github_repo:
        type: string
        default: ""
      dist_amdgpu_families:
        type: string
        required: true
      platform:
        type: string
        default: "linux"
      package_version:
        type: string
        required: true
      release_type:
        type: string
        default: ""

permissions:
  contents: read

run-name: Build Multi-Arch Tarballs (${{ inputs.dist_amdgpu_families }}, ${{ inputs.platform }})

jobs:
  build_tarballs:
    name: Build Tarballs
    runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }}
    permissions:
      # A job-level permissions block replaces the workflow-level one, so
      # contents: read must be restated here for the checkout step.
      contents: read
      id-token: write
    env:
      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
      RELEASE_TYPE: ${{ inputs.release_type }}

    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Setting up Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.12"

      - name: Install Python requirements
        run: pip install -r requirements.txt

      - name: Build tarballs
        # Inputs are passed through the environment instead of being expanded
        # into the script text, so their content is never parsed as shell.
        env:
          ARTIFACT_GITHUB_REPO: ${{ inputs.artifact_github_repo }}
          DIST_AMDGPU_FAMILIES: ${{ inputs.dist_amdgpu_families }}
          DIST_PLATFORM: ${{ inputs.platform }}
          PACKAGE_VERSION: ${{ inputs.package_version }}
        run: |
          python build_tools/build_tarballs.py \
            --run-id="${ARTIFACT_RUN_ID}" \
            --run-github-repo="${ARTIFACT_GITHUB_REPO}" \
            --dist-amdgpu-families="${DIST_AMDGPU_FAMILIES}" \
            --platform="${DIST_PLATFORM}" \
            --package-version="${PACKAGE_VERSION}" \
            --output-dir="${{ github.workspace }}/tarballs"

      - name: Configure AWS Credentials
        uses: ./.github/actions/configure_aws_artifacts_credentials
        with:
          release_type: ${{ inputs.release_type }}

      - name: Upload tarballs
        id: upload
        env:
          DIST_PLATFORM: ${{ inputs.platform }}
        run: |
          python build_tools/github_actions/upload_tarballs.py \
            --input-tarballs-dir="${{ github.workspace }}/tarballs" \
            --run-id="${{ github.run_id }}" \
            --platform="${DIST_PLATFORM}"
# -- Tarballs ---------------------------------------------------------------

def tarballs(self) -> StorageLocation:
    """Return the storage location of the ``tarballs`` directory.

    The location is built from this root's ``bucket`` and the key
    ``{prefix}/tarballs``.
    """
    tarballs_key = f"{self.prefix}/tarballs"
    return StorageLocation(self.bucket, tarballs_key)
+ +Tarball naming follows the existing release convention: + therock-dist-{platform}-{family}-{version}.tar.gz + therock-dist-{platform}-multiarch-{version}.tar.gz (KPACK split only) + +Example +------- + python build_tools/build_tarballs.py \\ + --run-id=24104028483 \\ + --dist-amdgpu-families="gfx94X-dcgpu;gfx110X-all" \\ + --platform=linux \\ + --package-version="7.13.0.dev0+abc123" \\ + --output-dir=/tmp/tarballs + +Manual testing +-------------- +Find a recent multi-arch CI run at +https://github.com/ROCm/TheRock/actions/workflows/multi_arch_ci.yml +and use its run ID. Use ``--platform`` to select which platform's +artifacts to fetch (defaults to the current system). + +Expected output: one .tar.gz per family in ``--output-dir``, named +``therock-dist-{platform}-{family}-{version}.tar.gz``. If +KPACK_SPLIT_ARTIFACTS is enabled in the build, also a +``therock-dist-{platform}-multiarch-{version}.tar.gz``. + +Each tarball should contain a standard install prefix layout +(``bin/``, ``lib/``, ``include/``, ``share/``, etc.) with GPU-specific +files (e.g. ``lib/hipblaslt/library/*.co``) only for the target family. 
+""" + +import argparse +from concurrent.futures import ProcessPoolExecutor, as_completed +import json +import shlex +import subprocess +import sys +from pathlib import Path + + +def log(msg: str): + print(msg, flush=True) + + +def run_command(args: list[str | Path], cwd: Path | None = None): + args = [str(arg) for arg in args] + log(f"++ Exec{f' [{cwd}]' if cwd else ''}$ {shlex.join(args)}") + subprocess.check_call(args, cwd=str(cwd) if cwd else None, stdin=subprocess.DEVNULL) + + +def fetch_and_flatten( + *, + run_id: str, + amdgpu_families: list[str], + platform: str, + output_dir: Path, + download_cache_dir: Path, + run_github_repo: str | None = None, +): + """Fetch artifacts for one or more families and flatten into output_dir.""" + families_str = ";".join(amdgpu_families) + log(f"\n{'='*60}") + log(f"Fetching artifacts for {families_str}") + log(f"{'='*60}") + + cmd = [ + sys.executable, + "build_tools/artifact_manager.py", + "fetch", + f"--run-id={run_id}", + "--stage=all", + f"--amdgpu-families={families_str}", + "--expand-family-to-targets", + f"--platform={platform}", + f"--output-dir={output_dir}", + "--flatten", + f"--download-cache-dir={download_cache_dir}", + ] + if run_github_repo: + cmd.append(f"--run-github-repo={run_github_repo}") + run_command(cmd) + + +def is_kpack_split(flatten_dir: Path) -> bool: + """Check if KPACK_SPLIT_ARTIFACTS is enabled from the build manifest.""" + manifest_path = flatten_dir / "share" / "therock" / "therock_manifest.json" + if not manifest_path.exists(): + return False + manifest = json.loads(manifest_path.read_text()) + return manifest.get("flags", {}).get("KPACK_SPLIT_ARTIFACTS", False) + + +def compress_tarball(*, source_dir: Path, tarball_path: Path): + """Compress a directory into a .tar.gz tarball. 
+ + Uses subprocess ``tar cfz`` rather than Python's ``tarfile`` module + (tarfile was significantly slower and produced larger output with default + settings — its ``compresslevel`` parameter may help but was not tuned). + + Uses gzip to match the existing release tarball format. Switching to + zstd (``tar cf - . | zstd``) would be faster with better compression, + but requires downstream consumers to support ``.tar.zst``. + """ + log(f"\nCompressing {source_dir} -> {tarball_path}") + tarball_path.parent.mkdir(parents=True, exist_ok=True) + run_command(["tar", "cfz", str(tarball_path), "."], cwd=source_dir) + size_mb = tarball_path.stat().st_size / (1024 * 1024) + log(f" Created {tarball_path.name} ({size_mb:.1f} MB)") + + +def main(argv=None): + parser = argparse.ArgumentParser( + description="Fetch multi-arch artifacts and package into per-family tarballs" + ) + parser.add_argument("--run-id", required=True, help="Workflow run ID to fetch from") + parser.add_argument( + "--run-github-repo", + type=str, + default=None, + help="GitHub repository for --run-id in 'owner/repo' format. " + "Defaults to GITHUB_REPOSITORY env var or 'ROCm/TheRock'", + ) + parser.add_argument( + "--dist-amdgpu-families", + required=True, + help="Semicolon-separated GPU families (e.g. 
'gfx94X-dcgpu;gfx110X-all')", + ) + parser.add_argument( + "--platform", + default="linux", + choices=["linux", "windows"], + help="Platform to fetch artifacts for", + ) + parser.add_argument( + "--package-version", + required=True, + help="ROCm package version string for tarball naming", + ) + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Output directory for tarballs", + ) + args = parser.parse_args(argv) + # Normalize empty string to None (workflow inputs default to "") + args.run_github_repo = args.run_github_repo or None + + families = [f.strip() for f in args.dist_amdgpu_families.split(";") if f.strip()] + if not families: + raise ValueError("No GPU families specified") + + work_dir = args.output_dir / ".work" + download_cache_dir = work_dir / "download-cache" + download_cache_dir.mkdir(parents=True, exist_ok=True) + + log(f"Building tarballs for {len(families)} families: {', '.join(families)}") + log(f" Platform: {args.platform}") + log(f" Version: {args.package_version}") + log(f" Output: {args.output_dir}") + + # Phase 1: Fetch and flatten sequentially. + # Sequential so the shared download cache avoids re-downloading generic + # (host) artifacts for each family. + family_dirs = [] + compress_tasks = [] + for family in families: + flatten_dir = work_dir / family + fetch_and_flatten( + run_id=args.run_id, + amdgpu_families=[family], + platform=args.platform, + output_dir=flatten_dir, + download_cache_dir=download_cache_dir, + run_github_repo=args.run_github_repo, + ) + family_dirs.append(flatten_dir) + tarball_name = ( + f"therock-dist-{args.platform}-{family}-{args.package_version}.tar.gz" + ) + compress_tasks.append((flatten_dir, args.output_dir / tarball_name)) + + # Phase 1.5: If KPACK_SPLIT_ARTIFACTS is enabled, fetch all families + # into a single combined directory. With KPACK split, device-specific + # files are per individual GPU target and don't conflict, so all + # families can coexist in a single install prefix. 
+ kpack_split = is_kpack_split(family_dirs[0]) + if kpack_split and len(families) > 1: + log("::: KPACK_SPLIT_ARTIFACTS detected — building multi-arch tarball") + multiarch_dir = work_dir / "multiarch" + fetch_and_flatten( + run_id=args.run_id, + amdgpu_families=families, + platform=args.platform, + output_dir=multiarch_dir, + download_cache_dir=download_cache_dir, + run_github_repo=args.run_github_repo, + ) + tarball_name = ( + f"therock-dist-{args.platform}-multiarch-{args.package_version}.tar.gz" + ) + compress_tasks.append((multiarch_dir, args.output_dir / tarball_name)) + + # Phase 2: Compress all tarballs in parallel. + # Each tar cfz is single-threaded, so running N families concurrently + # on a multi-core runner scales well with minimal per-job slowdown. + # TODO: Add --compress-workers flag to cap concurrency on smaller runners. + log(f"\nCompressing {len(compress_tasks)} tarballs in parallel...") + with ProcessPoolExecutor(max_workers=len(compress_tasks)) as executor: + futures = { + executor.submit(compress_tarball, source_dir=src, tarball_path=dst): dst + for src, dst in compress_tasks + } + for future in as_completed(futures): + future.result() # Raises on failure + + log(f"\nDone. Tarballs in {args.output_dir}:") + for tb in sorted(args.output_dir.glob("*.tar.gz")): + size_mb = tb.stat().st_size / (1024 * 1024) + log(f" {tb.name} ({size_mb:.1f} MB)") + + +if __name__ == "__main__": + main() diff --git a/build_tools/github_actions/upload_tarballs.py b/build_tools/github_actions/upload_tarballs.py new file mode 100644 index 0000000000..fd44c3afcd --- /dev/null +++ b/build_tools/github_actions/upload_tarballs.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +"""Upload tarballs to S3. + +Uploads all .tar.gz files from the input directory to the tarballs/ +subdirectory of the workflow output root in S3. 
def log(*args):
    """Print ``args`` to stdout the way ``print`` does, then flush."""
    print(*args, flush=True)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(description="Upload tarballs to S3")
    parser.add_argument(
        "--input-tarballs-dir",
        type=Path,
        required=True,
        help="Directory containing .tar.gz tarballs to upload",
    )
    parser.add_argument(
        "--run-id",
        type=str,
        required=True,
        help="Workflow run ID",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Output to local directory instead of S3",
    )
    parser.add_argument(
        "--platform",
        type=str,
        default=platform_module.system().lower(),
        choices=["linux", "windows"],
        help="Platform for the upload path (default: current system)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would happen without uploading",
    )
    return parser


def main(argv=None):
    """Upload every ``*.tar.gz`` in the input directory to the run's tarballs location.

    Args:
        argv: Argument list for argparse; ``None`` means ``sys.argv[1:]``.
            Accepting it lets callers and tests drive the script in-process,
            matching ``build_tarballs.main(argv=None)``.

    Raises:
        FileNotFoundError: If the input directory does not exist or holds
            no ``.tar.gz`` files.
    """
    args = _build_arg_parser().parse_args(argv)

    # Validate the inputs before any storage backend is created.
    tarballs_dir = args.input_tarballs_dir.resolve()
    if not tarballs_dir.is_dir():
        raise FileNotFoundError(f"Tarballs directory not found: {tarballs_dir}")

    tarball_files = sorted(tarballs_dir.glob("*.tar.gz"))
    if not tarball_files:
        raise FileNotFoundError(f"No .tar.gz files found in {tarballs_dir}")

    log(f"[INFO] Tarballs directory: {tarballs_dir}")
    log(f"[INFO] Run ID: {args.run_id}")
    log(f"[INFO] Platform: {args.platform}")
    log(f"[INFO] Found {len(tarball_files)} tarballs:")
    for f in tarball_files:
        size_mb = f.stat().st_size / (1024 * 1024)
        log(f" {f.name} ({size_mb:.1f} MB)")

    output_root = WorkflowOutputRoot.from_workflow_run(
        run_id=args.run_id, platform=args.platform
    )
    tarballs_loc = output_root.tarballs()
    # staging_dir / dry_run are forwarded from --output-dir / --dry-run.
    backend = create_storage_backend(staging_dir=args.output_dir, dry_run=args.dry_run)

    log(f"\n[INFO] Uploading to {tarballs_loc.s3_uri}")
    count = backend.upload_directory(tarballs_dir, tarballs_loc, include=["*.tar.gz"])
    log(f"[INFO] Uploaded {count} files")
    log("[INFO] Done!")


if __name__ == "__main__":
    main()
class TestCompressTarball(unittest.TestCase):
    """Checks that compress_tarball writes a readable, non-empty .tar.gz."""

    def test_creates_tarball(self):
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            # Small install-prefix-like tree: one text file, one binary file.
            tree = root / "src"
            tree.mkdir()
            bin_dir = tree / "bin"
            bin_dir.mkdir()
            (bin_dir / "hello").write_text("hello world")
            lib_dir = tree / "lib"
            lib_dir.mkdir()
            (lib_dir / "libfoo.so").write_bytes(b"\x00" * 1024)

            # The "output" parent does not exist before the call.
            archive = root / "output" / "test.tar.gz"
            compress_tarball(source_dir=tree, tarball_path=archive)

            self.assertTrue(archive.exists())
            self.assertGreater(archive.stat().st_size, 0)

            with tarfile.open(archive, "r:gz") as tar:
                members = tar.getnames()
            self.assertIn("./bin/hello", members)
            self.assertIn("./lib/libfoo.so", members)
*.whl *.tar.gz index.html + + tarballs/ + therock-dist-{platform}-{family}-{version}.tar.gz ``` The `comp-summary.*` files appear both in the `therock-build-prof/` subdirectory @@ -122,6 +125,10 @@ families in parallel, producing identically-named log files (e.g., *.whl (per-family wheels, e.g., rocm_sdk_devel) *.tar.gz (sdist) index.html + + tarballs/ + therock-dist-{platform}-{family}-{version}.tar.gz + therock-dist-{platform}-multiarch-{version}.tar.gz (KPACK split only) ``` Example for a run with foundation + math-libs stages: @@ -225,6 +232,7 @@ root.stage_log_dir(stage_name="foundation") # generic stage, no family root.manifest_dir(artifact_group="gfx94X-dcgpu") root.manifest(artifact_group="gfx94X-dcgpu") root.python_packages(artifact_group="gfx110X-all") +root.tarballs() ``` The `lookup_workflow_run` parameter controls whether `from_workflow_run()` calls @@ -268,6 +276,7 @@ To add a new output type: | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------- | | [`post_build_upload.py`](/build_tools/github_actions/post_build_upload.py) | `WorkflowOutputRoot` + `StorageBackend` for artifacts, logs, manifests | | [`post_stage_upload.py`](/build_tools/github_actions/post_stage_upload.py) | `WorkflowOutputRoot` + `StorageBackend` for multi-arch stage logs | +| [`upload_tarballs.py`](/build_tools/github_actions/upload_tarballs.py) | `WorkflowOutputRoot` + `StorageBackend` for tarballs | | [`upload_python_packages.py`](/build_tools/github_actions/upload_python_packages.py) | `WorkflowOutputRoot` + `StorageBackend` for Python wheels and index | | [`upload_pytorch_manifest.py`](/build_tools/github_actions/upload_pytorch_manifest.py) | `WorkflowOutputRoot` + `StorageBackend` for PyTorch manifests | | [`upload_test_report_script.py`](/build_tools/github_actions/upload_test_report_script.py) | `WorkflowOutputRoot` for S3 base URI (upload not yet migrated to 
backend) |