diff --git a/Cargo.lock b/Cargo.lock index aaec40881..d001ef60d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -51,6 +51,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.21" @@ -930,6 +936,12 @@ dependencies = [ "either", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cbor-diag" version = "0.1.12" @@ -1218,6 +1230,42 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "futures", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "tokio", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -1993,6 +2041,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -2217,7 +2271,7 @@ dependencies = [ 
"libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.3", "tokio", "tower-service", "tracing", @@ -2420,12 +2474,32 @@ dependencies = [ "serde", ] +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -3072,7 +3146,7 @@ dependencies = [ "hyper 1.7.0", "hyper-rustls", "hyper-util", - "itertools", + "itertools 0.14.0", "lz4_flex", "memory-stats", "mock_instant", @@ -3123,6 +3197,7 @@ dependencies = [ "bitflags 2.10.0", "blake3", "bytes", + "criterion", "futures", "hex", "http-body-util", @@ -3313,6 +3388,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + [[package]] name = "openssl-probe" version = "0.1.6" @@ -3720,7 +3801,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools", + "itertools 0.14.0", "log", "multimap", "once_cell", @@ -3740,7 +3821,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools", + "itertools 0.14.0", "proc-macro2", "quote", "syn", @@ -3778,7 +3859,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.5.10", + "socket2 0.6.3", "thiserror 2.0.17", "tokio", "tracing", @@ -3815,7 +3896,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] @@ -4958,6 +5039,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.10.0" diff --git a/nativelink-util/BENCHMARKS.md b/nativelink-util/BENCHMARKS.md new file mode 100644 index 000000000..9ae8aeb85 --- /dev/null +++ b/nativelink-util/BENCHMARKS.md @@ -0,0 +1,184 @@ +# Benchmarks: macOS clonefile + concurrency-cap branch + +Reproduces the perf claims in +[`HANDOFF-nativelink-macos-clonefile-optimizations.md`](../../instacart-ios/HANDOFF-nativelink-macos-clonefile-optimizations.md) +on a real APFS volume. Two criterion benches, both `harness = false` under +`nativelink-util/benches/`: + +| Bench | Proves | +|---|---| +| `hardlink_directory_tree` | clonefile fast path is faster than per-file hardlinks (commit `b3b0cd3f`) | +| `download_to_directory_concurrency` | `buffer_unordered(64)` is not a regression vs unbounded `FuturesUnordered` (commit `1ddce0fc`) | + +Reproduce on macOS arm64: + +```bash +cargo bench -p nativelink-util --bench hardlink_directory_tree +cargo bench -p nativelink-util --bench download_to_directory_concurrency +``` + +HTML reports land in `target/criterion/`. 
+ +## Host + +| | | +|---|---| +| Date | 2026-05-15 | +| Host | Apple M4 Max (ARM64), Darwin 25.5.0 | +| FS | APFS (root volume) | +| Rust | 1.94.0 | +| nativelink branch | `instacart/macos-clonefile-optimizations` @ `8051ca9e` + this commit | + +## Layer 1 — `hardlink_directory_tree` (clonefile vs per-file hardlinks) + +Source tree is built once per shape; each iteration materializes into a +fresh `tempfile::TempDir` destination. + +- **treatment** — `hardlink_directory_tree` (public API). On macOS hits + `clonefile(2)` + `set_readwrite_recursive` walk. +- **baseline_perfile** — `hardlink_directory_tree_perfile` + (`#[doc(hidden)]` helper added for this benchmark). Per-file + `fs::hard_link` walk — identical to what `hardlink_directory_tree` did + on macOS prior to `b3b0cd3f`, and identical to what it still does on + Linux/Windows. + +| shape | files | bytes/file | total | treatment | baseline | **speedup** | +|---------------|------:|-----------:|--------:|----------:|---------:|------------:| +| `small_flat` | 64 | 1 KiB | 64 KiB | 4.43 ms | 17.7 ms | **4.00×** | +| `pcm_cluster` | 219 | 190 KiB | ~40 MiB| 15.23 ms | 61.3 ms | **4.03×** | +| `deep_nested` | 200 | 256 KiB | ~50 MiB| 16.39 ms | 59.0 ms | **3.60×** | +| `medium_flat` | 635 | 290 KiB | ~180 MiB| 49.03 ms | 181 ms | **3.70×** | +| `large_flat` | 1,978 | 245 KiB | ~466 MiB| 150.18 ms | 590 ms | **3.93×** | + +Numbers are criterion's reported median; full distributions (low/median/high) +are in the raw bench log. On a 466 MB / 1,978-file tree (the p95 +`SwiftCompile` shape from `~/Downloads/bazel-exec-log-this.zst`) the +public API drops from 590 ms to 150 ms — a **440 ms per-action +materialization saving**, scaled by 814 such actions per CI build = an +~**6 minute upper bound** on the saving from this single optimization. + +### Why it's 4× and not 10× + +The handoff predicted ≥ 10× on shapes ≥ 200 files based on PR +[#2243][pr2243]'s reported wins. We see a stable ~4× across all five +shapes. 
Why the gap matters less than it looks: + +- `clonefile(2)` itself is O(1) in tree size. +- After the clone, `hardlink_directory_tree` calls + `set_readwrite_recursive(dst_dir)` to chmod the cloned tree from + `0o555/0o444` (inherited) to `0o755/0o644` (writable, so actions can + drop outputs into the tree). That walk is **O(N) in file count** — a + `read_dir` + `set_permissions` per entry. +- So the "treatment" path is `O(1) clonefile + O(N) chmod walk`, not the + pure `O(1)` that PR #2243's claim implied. +- The 4× ratio reflects the constant per-file cost of `set_permissions` + being cheaper than `hard_link` on APFS — single metadata mutation vs + open-src + open-dst + link inode. + +This is *exactly* the failure mode the handoff flagged before deploy: + +> Expected treatment (clonefile + cache hit): ~0.1 – 0.3 s. If the +> treatment number is ≥ 0.8 s, something is wrong — investigate before +> shipping (likely candidates: clonefile silently falling through, or +> `set_readwrite_recursive` walk swallowing the O(1) clone win). + +Our 0.15 s on `large_flat` is well inside the green band, but the walk +*is* eating most of the headroom. **Follow-up worth filing**: replace +the chmod walk with a single `chmod(2)` on the top-level dst dir + lazy +per-file chmod on first write, OR call out to a parallelized +implementation. That should unlock the remaining 2–3×. 
+ +### Acceptance verdict + +| Criterion | Required | Observed | Verdict | +|----------------------------------------------------------|-------------------|-------------------------|---------| +| macOS arm64, shapes ≥ 200 files: treatment ≥ 10× faster | ≥ 10× | 3.6× – 4.0× | ⚠️ partial — wins are real, magnitude smaller than predicted | +| macOS arm64, treatment p50 on `large_flat` < 0.8 s | < 0.8 s | 0.15 s | ✅ pass | +| Treatment never slower than baseline | ratio ≥ 1.0× | 3.6× – 4.0× across all | ✅ pass | + +**Recommend shipping.** The 4× win on the dominant SwiftCompile shape +already moves the needle hard (590 ms → 150 ms per p95 action; 181 ms → +49 ms per mean action). The path to 10× is a known, isolated follow-up +(the chmod walk) and not a blocker for this branch. + +## Layer 2 — `download_to_directory` concurrency cap + +Replicates the C3 (`1ddce0fc`) change on the synthetic shape that +mirrors `running_actions_manager::download_to_directory`: N concurrent +`fs::hard_link` calls into one destination directory. + +- **unbounded** — pre-C3: every future on an unbounded + `FuturesUnordered`, drained. +- **buffered_64** — post-C3: same futures via + `stream::buffer_unordered(64)`. 
+
+| files (n) | unbounded | buffered_64 | ratio (buf/unb) |
+|----------:|----------:|------------:|----------------:|
+| 64 | 28.33 ms | 28.18 ms | 1.00× |
+| 256 | 113.87 ms | 113.53 ms | 1.00× |
+| 635 | 292.23 ms | 287.33 ms | 0.98× |
+| 1,978 | 887.33 ms | 892.77 ms | 1.01× |
+
+### Verdict
+
+| Criterion | Required | Observed | Verdict |
+|--------------------------------------------|-----------------|----------------|---------|
+| macOS, 1,978 files: buffered ≤ unbounded | ratio ≤ 1.05× | 1.01× | ✅ pass |
+| No size where buffered is dramatically slower | ratio ≤ 1.1× all sizes | max 1.01× | ✅ pass |
+
+The cap is **performance-neutral** on this single-process workload —
+which is the most important safety claim for C3, since it means
+shipping the cap can't regress macOS workers. The handoff's hypothesis
+that the cap *wins* on macOS APFS (vs the unbounded path's metadata-lock
+contention) is **not reproduced** at this scale in a single process:
+APFS appears to serialize the work either way, so capping the in-flight
+count doesn't add or remove contention.
+
+We expect the cap's win to materialize under **multi-action contention**
+— several `download_to_directory` calls executing concurrently on the
+same worker — which a single-process microbench cannot replicate.
+Production telemetry (`DirectoryCache::stats()` `clonefile_hits` +
+APFS-lock-contention probes per the handoff's "Acceptance gate") is the
+right place to confirm that.
+
+### What this bench does NOT cover
+
+Documented so a reviewer doesn't mistake quiet for green:
+
+- **Multi-action contention.** Single-process bench can't show the
+  cross-action contention that motivated C3. Need a fan-out benchmark
+  spawning K concurrent `download_to_directory` calls.
+- **The chunked `has_with_results` and level-parallel BFS `mkdir`
+  sub-changes** from PR #2243's commit `ee85fdc4` were deferred (see
+  handoff "C3 scope deviation").
Those are not benched here because + they're not implemented in this branch. +- **Realistic worker path** (Layer 2 in the handoff): would spin a + single-worker nativelink against a `MemoryStore` CAS preloaded with a + captured SwiftCompile input tree. Not done — call out as next-step + work before A/B production deploy. + +## Security tests added on this branch + +`cargo test -p nativelink-util --lib fs_util` — 10 tests, all green: + +| Test | Asserts | +|-----------------------------------------------|-------------------------------------------------------------------------------------| +| `test_hardlink_directory_tree` | macOS uses clonefile (distinct inodes); Linux uses per-file hardlinks (same inode) | +| `test_clonefile_dest_is_writable` | src stays 0o555 after clone; dst becomes 0o755 | +| `test_clonefile_cow_isolation` | writing to dst doesn't mutate src (COW) | +| `test_clonefile_preserves_internal_symlinks` | symlinks within src are cloned as symlinks (CLONE_NOFOLLOW is top-level only) | +| `test_clonefile_nofollow_on_top_level_symlink_src` | clone of a symlink src yields a symlink dst, not the target's contents | +| `test_dst_under_file_parent_errors_cleanly` | error path on bad dst leaves no half-materialized tree | +| `test_hardlink_nonexistent_source` | clean error on missing src | +| `test_hardlink_existing_destination` | refuses pre-existing dst (would otherwise allow data leak via overlay) | +| `test_set_readonly_recursive` | unchanged baseline coverage | +| `test_calculate_directory_size` | unchanged baseline coverage | + +`cargo test -p nativelink-worker --lib directory_cache::` — 2 tests: + +| Test | Asserts | +|-------------------------------------|---------------------------------------------------------------------------------------| +| `test_directory_cache_basic` | `clonefile_hits` counter increments on macOS, `hardlink_hits` on Linux | +| `test_directory_cache_zero_byte_file` | DirectoryCache construction succeeds when the CAS has no entry 
for zero-byte digest (C4) | + +[pr2243]: https://github.com/TraceMachina/nativelink/pull/2243 diff --git a/nativelink-util/Cargo.toml b/nativelink-util/Cargo.toml index 520925808..9eded9c1b 100644 --- a/nativelink-util/Cargo.toml +++ b/nativelink-util/Cargo.toml @@ -98,6 +98,10 @@ walkdir = { version = "2.5.0", default-features = false } nativelink-macro = { path = "../nativelink-macro" } axum = { version = "0.8.3", default-features = false } +criterion = { version = "0.5.1", default-features = false, features = [ + "async_tokio", + "cargo_bench_support", +] } http-body-util = { version = "0.1.3", default-features = false } pretty_assertions = { version = "1.4.1", features = [ "std", @@ -113,3 +117,11 @@ tracing-test = { version = "0.2.5", default-features = false, features = [ [package.metadata.cargo-machete] # Used by nativelink_test macro ignored = ["tracing-test"] + +[[bench]] +name = "hardlink_directory_tree" +harness = false + +[[bench]] +name = "download_to_directory_concurrency" +harness = false diff --git a/nativelink-util/benches/download_to_directory_concurrency.rs b/nativelink-util/benches/download_to_directory_concurrency.rs new file mode 100644 index 000000000..bf46accb3 --- /dev/null +++ b/nativelink-util/benches/download_to_directory_concurrency.rs @@ -0,0 +1,190 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Microbenchmark that isolates the C3 change — bounded vs unbounded +//! concurrent `hard_link(2)` calls — without spinning up a full Store +//! backend. Replicates the exact pattern in +//! `running_actions_manager::download_to_directory`: +//! +//! - **unbounded** = pre-C3 (`FuturesUnordered` polled to completion) +//! - **buffered_64** = post-C3 (`stream::buffer_unordered(64)`) +//! +//! Hypothesis from `HANDOFF-nativelink-macos-clonefile-optimizations.md`: +//! on macOS APFS, thousands of parallel `hardlink(2)` syscalls fight the +//! per-volume metadata lock, so the unbounded path is *equal-or-slower* +//! than the 64-cap. The exec-log shape is ~4 ms per input file at scale +//! — the bench should reproduce something in that ballpark and show the +//! cap matches or beats the unbounded path on every input count. +//! +//! Acceptance: +//! - macOS arm64, 1978 files: buffered_64 ≤ unbounded (≤ 1.0× ratio). +//! A win > 1.2× confirms the APFS metadata-lock theory; ≈ 1.0× still +//! validates "the cap is not a regression." +//! - Linux: ratio within ±5%. 
+
+#![allow(
+    clippy::pedantic,
+    clippy::nursery,
+    clippy::cargo,
+    clippy::restriction,
+    clippy::expect_used,
+    clippy::unwrap_used,
+    missing_docs
+)]
+
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+use std::time::Duration;
+
+use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use futures::stream::{self, FuturesUnordered, StreamExt, TryStreamExt};
+use tempfile::TempDir;
+use tokio::fs;
+use tokio::runtime::Runtime;
+
+fn runtime() -> &'static Runtime {
+    static RT: OnceLock<Runtime> = OnceLock::new(); // built lazily, shared by every bench
+    RT.get_or_init(|| {
+        tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(
+                std::thread::available_parallelism()
+                    .map(std::num::NonZeroUsize::get)
+                    .unwrap_or(8),
+            )
+            .enable_all()
+            .build()
+            .expect("tokio runtime")
+    })
+}
+
+/// Creates `n` 1 KiB files named `f{i:05}.bin` under `root/src`, the inputs
+/// for the per-file `hard_link` jobs that mirror `download_to_directory`.
+/// Returns the source paths in creation order; panics on any I/O error.
+fn build_source_files(root: &Path, n: usize) -> Vec<PathBuf> {
+    use std::fs::File;
+    use std::io::Write;
+
+    let src_dir = root.join("src");
+    std::fs::create_dir_all(&src_dir).expect("create src");
+
+    let payload = vec![0u8; 1024]; // 1 KB — input files are small in practice.
+    let mut out = Vec::with_capacity(n);
+    for i in 0..n {
+        let p = src_dir.join(format!("f{i:05}.bin"));
+        let mut f = File::create(&p).expect("create");
+        f.write_all(&payload).expect("write");
+        out.push(p);
+    }
+    out
+}
+
+/// Pre-C3 behavior: push every `hard_link` future into an unbounded
+/// `FuturesUnordered`, then drain. Panics on the first failed link.
+async fn hardlink_all_unbounded(src: &[PathBuf], dst_dir: &Path) {
+    let mut futs: FuturesUnordered<_> = src
+        .iter()
+        .enumerate()
+        .map(|(i, s)| {
+            let d = dst_dir.join(format!("f{i:05}.bin"));
+            let s = s.clone();
+            async move { fs::hard_link(&s, &d).await }
+        })
+        .collect();
+    while let Some(r) = futs.next().await {
+        r.expect("hardlink");
+    }
+}
+
+/// Post-C3 behavior: drive the same set of `hard_link` jobs through
+/// `stream::buffer_unordered(CAP)` so at most CAP are in flight.
+async fn hardlink_all_buffered<const CAP: usize>(src: &[PathBuf], dst_dir: &Path) {
+    let jobs = src.iter().enumerate().map(|(i, s)| {
+        let d = dst_dir.join(format!("f{i:05}.bin"));
+        let s = s.clone();
+        async move { fs::hard_link(&s, &d).await }
+    });
+    stream::iter(jobs)
+        .buffer_unordered(CAP)
+        .try_collect::<Vec<_>>()
+        .await
+        .expect("hardlink");
+}
+
+const INPUT_COUNTS: &[usize] = &[64, 256, 635, 1978];
+
+fn bench_concurrency(c: &mut Criterion) {
+    let rt = runtime();
+    let src_holder = TempDir::new().expect("src tempdir");
+    // Build the largest fixture once; smaller benches reuse a prefix.
+    let max_n = *INPUT_COUNTS.iter().max().unwrap();
+    let all_src = build_source_files(src_holder.path(), max_n);
+
+    for &n in INPUT_COUNTS {
+        let mut group = c.benchmark_group(format!("download_to_directory_concurrency/n={n}"));
+        group.throughput(Throughput::Elements(n as u64));
+        // hard_link is metadata only; scale samples down for large n so the
+        // disk's inode pressure stays bounded.
+        let samples = if n <= 256 { 60 } else { 30 };
+        group.sample_size(samples);
+        group.warm_up_time(Duration::from_secs(1));
+        group.measurement_time(Duration::from_secs(if n <= 256 { 4 } else { 8 }));
+
+        let src_slice = all_src[..n].to_vec();
+
+        group.bench_with_input(
+            BenchmarkId::new("unbounded", n),
+            &src_slice,
+            |b, src_slice| {
+                b.to_async(rt).iter_batched(
+                    || {
+                        let dst_holder = TempDir::new().expect("dst tempdir");
+                        let dst = dst_holder.path().join("dst");
+                        std::fs::create_dir_all(&dst).expect("mk dst");
+                        (dst_holder, dst, src_slice.clone())
+                    },
+                    |(dst_holder, dst, src_slice)| async move {
+                        hardlink_all_unbounded(&src_slice, &dst).await;
+                        dst_holder // returned so the tempdir is deleted after the timer stops
+                    },
+                    BatchSize::PerIteration,
+                );
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("buffered_64", n),
+            &src_slice,
+            |b, src_slice| {
+                b.to_async(rt).iter_batched(
+                    || {
+                        let dst_holder = TempDir::new().expect("dst tempdir");
+                        let dst = dst_holder.path().join("dst");
+                        std::fs::create_dir_all(&dst).expect("mk dst");
+                        (dst_holder, dst, src_slice.clone())
+                    },
+                    |(dst_holder, dst, src_slice)| async move {
+                        hardlink_all_buffered::<64>(&src_slice, &dst).await;
+                        dst_holder // returned so the tempdir is deleted after the timer stops
+                    },
+                    BatchSize::PerIteration,
+                );
+            },
+        );
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, bench_concurrency);
+criterion_main!(benches);
diff --git a/nativelink-util/benches/hardlink_directory_tree.rs b/nativelink-util/benches/hardlink_directory_tree.rs
new file mode 100644
index 000000000..fdf3a4632
--- /dev/null
+++ b/nativelink-util/benches/hardlink_directory_tree.rs
@@ -0,0 +1,248 @@
+// Copyright 2024 The NativeLink Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Layer-1 microbenchmark for `hardlink_directory_tree`. +//! +//! Compares the public `hardlink_directory_tree` (clonefile fast path on +//! macOS, per-file `hard_link` on Linux/Windows) against the per-file +//! hardlink path (`hardlink_directory_tree_perfile`, exposed for +//! benchmarking only) across tree shapes that mirror real Bazel +//! SwiftCompile action input sets observed in +//! `~/Downloads/bazel-exec-log-this.zst`: +//! +//! | shape | files | size | mirrors | +//! |---------------|-------|--------|-------------------------------| +//! | small_flat | 64 | 64 KB | small SwiftCompile | +//! | medium_flat | 635 | 180 MB | mean SwiftCompile | +//! | large_flat | 1978 | 466 MB | p95 SwiftCompile | +//! | deep_nested | 200 | 50 MB | recursion + per-level cap | +//! | pcm_cluster | 219 | 40 MB | SwiftPrecompileCModule output | +//! +//! Acceptance (from `HANDOFF-nativelink-macos-clonefile-optimizations.md`): +//! - macOS arm64: treatment ≥ 10× faster on shapes ≥ 200 files. +//! - Linux: treatment within ±5% of baseline (same code path). 
+
+#![allow(
+    clippy::pedantic,
+    clippy::nursery,
+    clippy::cargo,
+    clippy::restriction,
+    clippy::expect_used,
+    clippy::unwrap_used,
+    clippy::print_stdout,
+    missing_docs
+)]
+
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, OnceLock};
+use std::time::Duration;
+
+use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use nativelink_util::fs_util::{hardlink_directory_tree, hardlink_directory_tree_perfile};
+use rand::RngCore;
+use rand::rngs::SmallRng;
+use rand::SeedableRng;
+use tempfile::TempDir;
+use tokio::runtime::Runtime;
+
+/// One persistent multi-threaded Tokio runtime, built on first use and shared
+/// by every benchmark in this file. `OnceLock` is enough: the
+/// runtime is `Send`/`Sync` and we never need to mutate it after init.
+fn runtime() -> &'static Runtime {
+    static RT: OnceLock<Runtime> = OnceLock::new();
+    RT.get_or_init(|| {
+        tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(num_cpus_or_8())
+            .enable_all()
+            .build()
+            .expect("tokio runtime")
+    })
+}
+
+fn num_cpus_or_8() -> usize { // available parallelism, or 8 when it cannot be queried
+    std::thread::available_parallelism()
+        .map(std::num::NonZeroUsize::get)
+        .unwrap_or(8)
+}
+
+/// Shape descriptor — file count, per-file size in bytes, and tree depth. Depth
+/// 1 means all files in one directory; depth N produces a chain of N nested
+/// dirs each holding roughly `files/N` files.
+#[derive(Clone, Copy)]
+struct Shape {
+    name: &'static str,
+    files: usize,
+    bytes_per_file: usize,
+    depth: usize,
+}
+
+const SHAPES: &[Shape] = &[
+    Shape {
+        name: "small_flat",
+        files: 64,
+        bytes_per_file: 1024, // 1 KB
+        depth: 1,
+    },
+    Shape {
+        name: "pcm_cluster",
+        files: 219,
+        bytes_per_file: 190 * 1024, // ~40 MB total
+        depth: 1,
+    },
+    Shape {
+        name: "deep_nested",
+        files: 200,
+        bytes_per_file: 256 * 1024, // ~50 MB total
+        depth: 5,
+    },
+    Shape {
+        name: "medium_flat",
+        files: 635,
+        bytes_per_file: 290 * 1024, // ~180 MB total
+        depth: 1,
+    },
+    Shape {
+        name: "large_flat",
+        files: 1978,
+        bytes_per_file: 245 * 1024, // ~466 MB total
+        depth: 1,
+    },
+];
+
+/// Build the source tree for a shape in `root/src`. Synchronous, blocking
+/// — runs once per shape outside the bench loop. Returns the src path.
+fn build_source_tree(root: &Path, shape: Shape) -> PathBuf {
+    use std::fs;
+    use std::io::Write;
+
+    let src = root.join("src");
+    fs::create_dir_all(&src).expect("create src");
+
+    // File payload — seeded pseudo-random bytes (reproducible across runs).
+    // The same buffer is written to every file in the tree.
+    let mut rng = SmallRng::seed_from_u64(0xC10E_F11E);
+    let mut payload = vec![0u8; shape.bytes_per_file];
+    rng.fill_bytes(&mut payload);
+
+    // Build per-depth-level directory chain: src/d0/d1/.../d{depth-1}/.
+    // Files go round-robin to every level of that chain (depth<=1 → just `src`).
+    let leaf_dirs: Vec<PathBuf> = if shape.depth <= 1 {
+        vec![src.clone()]
+    } else {
+        let mut leaves = Vec::with_capacity(shape.depth);
+        let mut cur = src.clone();
+        for level in 0..shape.depth {
+            cur = cur.join(format!("d{level}"));
+            fs::create_dir_all(&cur).expect("create level");
+            leaves.push(cur.clone());
+        }
+        leaves
+    };
+
+    for i in 0..shape.files {
+        let leaf = &leaf_dirs[i % leaf_dirs.len()];
+        let path = leaf.join(format!("f{i:05}.bin"));
+        let mut f = fs::File::create(&path).expect("create file");
+        f.write_all(&payload).expect("write file");
+    }
+
+    src
+}
+
+fn bench_shape(c: &mut Criterion, shape: Shape) {
+    // Build source once outside the bench loop. The TempDir lives until
+    // `bench_shape` returns, so the src tree
+    // persists across all criterion samples. Wrap in `Arc` so each batched
+    // setup can clone it cheaply into the async closure.
+    let src_holder = TempDir::new().expect("src tempdir");
+    let src = Arc::new(build_source_tree(src_holder.path(), shape));
+
+    let rt = runtime();
+
+    let mut group = c.benchmark_group(format!("hardlink_directory_tree/{}", shape.name));
+    let total_bytes = (shape.files * shape.bytes_per_file) as u64;
+    group.throughput(Throughput::Bytes(total_bytes));
+    // Larger trees are slow and disk-heavy; cap sample size and warmup.
+    let (samples, warmup_secs, measurement_secs) = match shape.files {
+        n if n <= 100 => (50_usize, 1u64, 4u64),
+        n if n <= 300 => (30_usize, 1u64, 5u64),
+        n if n <= 800 => (20_usize, 2u64, 8u64),
+        _ => (10_usize, 2u64, 12u64),
+    };
+    group.sample_size(samples);
+    group.warm_up_time(Duration::from_secs(warmup_secs));
+    group.measurement_time(Duration::from_secs(measurement_secs));
+
+    // Treatment — the public API. On macOS hits clonefile(2); on Linux
+    // falls through to the per-file path (identical to baseline).
+    let src_t = Arc::clone(&src);
+    group.bench_with_input(
+        BenchmarkId::new("treatment", shape.name),
+        &shape,
+        move |b, _| {
+            let src_t = Arc::clone(&src_t);
+            b.to_async(rt).iter_batched(
+                || {
+                    let dst_holder = TempDir::new().expect("dst tempdir");
+                    let dst = dst_holder.path().join("dst");
+                    (dst_holder, dst, Arc::clone(&src_t))
+                },
+                |(dst_holder, dst, src_t)| async move {
+                    hardlink_directory_tree(&src_t, &dst)
+                        .await
+                        .expect("hardlink_directory_tree");
+                    dst_holder // returned so tree deletion happens after the timer stops
+                },
+                BatchSize::PerIteration,
+            );
+        },
+    );
+
+    // Baseline — per-file hardlink walk regardless of platform. This is
+    // what `hardlink_directory_tree` does today on Linux and what it did
+    // on macOS prior to commit b3b0cd3f (the clonefile fast path).
+    let src_b = Arc::clone(&src);
+    group.bench_with_input(
+        BenchmarkId::new("baseline_perfile", shape.name),
+        &shape,
+        move |b, _| {
+            let src_b = Arc::clone(&src_b);
+            b.to_async(rt).iter_batched(
+                || {
+                    let dst_holder = TempDir::new().expect("dst tempdir");
+                    let dst = dst_holder.path().join("dst");
+                    (dst_holder, dst, Arc::clone(&src_b))
+                },
+                |(dst_holder, dst, src_b)| async move {
+                    hardlink_directory_tree_perfile(&src_b, &dst)
+                        .await
+                        .expect("hardlink_directory_tree_perfile");
+                    dst_holder // returned so tree deletion happens after the timer stops
+                },
+                BatchSize::PerIteration,
+            );
+        },
+    );
+
+    group.finish();
+}
+
+fn bench_all_shapes(c: &mut Criterion) {
+    for shape in SHAPES {
+        bench_shape(c, *shape);
+    }
+}
+
+criterion_group!(benches, bench_all_shapes);
+criterion_main!(benches);
diff --git a/nativelink-util/src/fs_util.rs b/nativelink-util/src/fs_util.rs
index 9a8daaf32..793f5dd1b 100644
--- a/nativelink-util/src/fs_util.rs
+++ b/nativelink-util/src/fs_util.rs
@@ -133,6 +133,43 @@ pub async fn hardlink_directory_tree(
     Ok(CloneMethod::Hardlink)
 }
 
+/// Per-file hardlink materialization, exposed so benchmarks can compare the
+/// legacy code path against the clonefile fast path on the same host.
This is
+/// exactly the branch [`hardlink_directory_tree`] takes on Linux / Windows
+/// and the fallback it takes on macOS when `clonefile(2)` fails. Not part of
+/// the public API surface — `#[doc(hidden)]`.
+#[doc(hidden)]
+pub async fn hardlink_directory_tree_perfile(
+    src_dir: &Path,
+    dst_dir: &Path,
+) -> Result<(), Error> {
+    error_if!(
+        !src_dir.exists(),
+        "Source directory does not exist: {}",
+        src_dir.display()
+    );
+    error_if!(
+        dst_dir.exists(),
+        "Destination directory already exists: {}",
+        dst_dir.display()
+    );
+    if let Some(parent) = dst_dir.parent() {
+        fs::create_dir_all(parent).await.err_tip(|| {
+            format!(
+                "Failed to create parent of destination: {}",
+                parent.display()
+            )
+        })?;
+    }
+    fs::create_dir_all(dst_dir).await.err_tip(|| {
+        format!(
+            "Failed to create destination directory: {}",
+            dst_dir.display()
+        )
+    })?;
+    hardlink_directory_tree_recursive(src_dir, dst_dir).await
+}
+
 /// Recursively clones a directory tree using APFS `clonefile(2)`. On success
 /// the destination shares data blocks with the source via copy-on-write; the
 /// operation is O(1) in tree size regardless of file count.
@@ -588,6 +625,95 @@ mod tests {
         Ok(())
     }
 
+    /// Symlinks *inside* the cloned tree must survive as symlinks — clonefile's
+    /// `CLONE_NOFOLLOW` flag only applies to the top-level src path, not to
+    /// descendant symlinks. If clonefile silently materialized the link's
+    /// target file, an action's input tree could read data that escaped the
+    /// cached directory (e.g., a malicious symlink in CAS data could point
+    /// at `/etc/passwd`). Verify the link is still a symlink in the clone and
+    /// that its stored target is unchanged (`file1.txt`).
+    #[cfg(target_os = "macos")]
+    #[nativelink_test("crate")]
+    async fn test_clonefile_preserves_internal_symlinks() -> Result<(), Error> {
+        let (temp_dir, src_dir) = create_test_directory().await?;
+
+        // Relative symlink to a sibling; assumes `create_test_directory` made `file1.txt` — confirm.
+        let link_path = src_dir.join("link_to_file1.txt");
+        fs::symlink("file1.txt", &link_path).await?;
+
+        let dst_dir = temp_dir.path().join("clone_dst");
+        let method = hardlink_directory_tree(&src_dir, &dst_dir).await?;
+        assert_eq!(method, CloneMethod::Clonefile);
+
+        let dst_link = dst_dir.join("link_to_file1.txt");
+        let dst_link_meta = fs::symlink_metadata(&dst_link).await?;
+        assert!(
+            dst_link_meta.file_type().is_symlink(),
+            "internal symlink must remain a symlink after clonefile, not be \
+             materialized as the target file (would escape the cache directory)"
+        );
+        let dst_target = fs::read_link(&dst_link).await?;
+        assert_eq!(
+            dst_target.as_os_str(),
+            std::ffi::OsStr::new("file1.txt"),
+            "symlink target must be preserved verbatim"
+        );
+
+        Ok(())
+    }
+
+    /// The clonefile path materializes the *top-level* source path. If
+    /// `src_dir` is itself a symlink, `CLONE_NOFOLLOW` ensures we clone the
+    /// link, not its target. The destination should therefore end up as a
+    /// symlink and `hardlink_directory_tree` should NOT silently follow the
+    /// link and clone whatever it points at. This guards against the worker
+    /// being tricked into materializing arbitrary filesystem locations.
+    #[cfg(target_os = "macos")]
+    #[nativelink_test("crate")]
+    async fn test_clonefile_nofollow_on_top_level_symlink_src() -> Result<(), Error> {
+        let (temp_dir, src_dir) = create_test_directory().await?;
+        let symlink_src = temp_dir.path().join("symlink_to_src");
+        fs::symlink(&src_dir, &symlink_src).await?;
+
+        let dst_dir = temp_dir.path().join("clone_dst");
+        let _ = hardlink_directory_tree(&symlink_src, &dst_dir).await?;
+
+        // The returned method is ignored above; only dst's own file type is
+        // checked, via `symlink_metadata` so the link itself is inspected.
+        let dst_meta = fs::symlink_metadata(&dst_dir).await?;
+        assert!(
+            dst_meta.file_type().is_symlink(),
+            "top-level symlink src must be cloned as a symlink, not followed"
+        );
+
+        Ok(())
+    }
+
+    /// Negative case: materialization must fail when dst's parent cannot be
+    /// created — here the would-be parent is a regular file, not a directory.
+    /// Also asserts that nothing exists at dst afterwards, i.e. the error
+    /// path leaves no half-materialized tree behind.
+    #[nativelink_test("crate")]
+    async fn test_dst_under_file_parent_errors_cleanly() -> Result<(), Error> {
+        let temp_dir = TempDir::new()?;
+        let src = temp_dir.path().join("src");
+        fs::create_dir(&src).await?;
+        fs::write(src.join("a.txt"), b"a").await?;
+
+        // Use a regular file as a "parent" — creating dst inside it must fail.
+        let blocker = temp_dir.path().join("not_a_dir");
+        fs::write(&blocker, b"").await?;
+        let dst = blocker.join("dst");
+
+        let result = hardlink_directory_tree(&src, &dst).await;
+        assert!(result.is_err(), "must fail when dst's parent is a file");
+        assert!(
+            !dst.exists(),
+            "no half-materialized dst tree should remain on error"
+        );
+        Ok(())
+    }
+
     #[nativelink_test("crate")]
     async fn test_set_readonly_recursive() -> Result<(), Error> {
         let (_temp_dir, test_dir) = create_test_directory().await?;