Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ use criterion::{criterion_group, criterion_main};

mod general_ops;
mod insert_unique_unchecked;
mod prefetch;
mod set_ops;
mod with_capacity;

// Collects each benchmark module's `register_benches` entry point into the
// single `benches` group. `prefetch::register_benches` contributes the
// `batch_lookup` benchmarks (plain lookups vs. lookups with look-ahead
// prefetching).
criterion_group!(
benches,
general_ops::register_benches,
insert_unique_unchecked::register_benches,
prefetch::register_benches,
set_ops::register_benches,
with_capacity::register_benches
);
Expand Down
87 changes: 87 additions & 0 deletions benches/prefetch.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
//! Batch-lookup benchmark: look up a list of keys in a large `HashMap`, with
//! and without software-prefetching a key a few iterations ahead.
//!
//! Prefetching only pays off when the table is large enough that its control
//! bytes spill out of the L2/L3 cache *and* the caller can issue the prefetch
//! far enough ahead of the use. So this benchmark sweeps the table size and
//! uses a randomized lookup order (so the access pattern is cache-hostile).
//! On a small, cache-resident table the prefetch is noise (or a slight loss);
//! the win shows up on the large sizes.

use criterion::{BenchmarkId, Criterion, Throughput};
use hashbrown::{DefaultHashBuilder, HashMap};
use std::hint::black_box;

// 16-byte keys, like a common join-key shape (two u64s).
type Key = (u64, u64);

const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22];
const LOOKAHEAD: usize = 8;
const N_QUERIES: usize = 1 << 16;

/// Builds a map holding `n` entries, preallocated so that filling it does not
/// trigger a resize.
///
/// Entry `i` is keyed by `(i, i * 0x9E37_79B9_7F4A_7C15)` (wrapping multiply)
/// and maps to `i`; `query_keys` derives its lookup keys the same way.
fn build_map(n: usize) -> HashMap<Key, u64, DefaultHashBuilder> {
    let mut table = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
    table.extend((0..n as u64).map(|id| ((id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)), id)));
    table
}

/// Advances a 64-bit xorshift generator (shift triple 13 / 7 / 17) by one step.
///
/// The new value is written back into `state` and also returned. It is a
/// cheap way to get a lookup order the hardware prefetcher cannot predict.
/// A zero state is a fixed point (it keeps producing zero), so seed it with a
/// non-zero value.
fn xorshift(state: &mut u64) -> u64 {
    let step1 = *state ^ (*state << 13);
    let step2 = step1 ^ (step1 >> 7);
    let mixed = step2 ^ (step2 << 17);
    *state = mixed;
    mixed
}

/// Produces the `N_QUERIES` lookup keys for a map built by `build_map(n)`.
///
/// Indices are drawn from a fixed-seed xorshift stream reduced modulo `n`, so
/// the sequence is the same on every run and every key has the shape that
/// `build_map` inserts. `n` must be non-zero (the reduction divides by it).
fn query_keys(n: usize) -> Vec<Key> {
    let modulus = n as u64;
    let mut rng = 0x1234_5678_9ABC_DEF0u64;
    let mut probes = Vec::with_capacity(N_QUERIES);
    for _ in 0..N_QUERIES {
        let id = xorshift(&mut rng) % modulus;
        probes.push((id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)));
    }
    probes
}

/// Baseline: looks up every key in order and returns the wrapping sum of the
/// values found. Missing keys contribute nothing.
fn lookup_naive(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
    let mut total = 0u64;
    for key in keys.iter() {
        if let Some(value) = map.get(key).copied() {
            total = total.wrapping_add(value);
        }
    }
    total
}

/// Same work as `lookup_naive`, but before each lookup it issues a prefetch
/// hint for the key `LOOKAHEAD` positions further on (when there is one).
///
/// Returns the wrapping sum of the values found, identical to the naive
/// variant for the same inputs.
fn lookup_prefetched(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
    let mut total = 0u64;
    for (idx, key) in keys.iter().enumerate() {
        // The last `LOOKAHEAD` keys have nothing left to prefetch.
        if let Some(upcoming) = keys.get(idx + LOOKAHEAD) {
            map.prefetch(upcoming);
        }
        if let Some(value) = map.get(key).copied() {
            total = total.wrapping_add(value);
        }
    }
    total
}

/// Registers the `batch_lookup` benchmark group.
///
/// For every table size in `SIZES` it builds one map and one query list, then
/// benchmarks the plain and the prefetching lookup loops over the same data.
/// Throughput is reported per query (`N_QUERIES` elements per iteration).
pub(crate) fn register_benches(c: &mut Criterion) {
    let mut batch = c.benchmark_group("batch_lookup");
    batch.throughput(Throughput::Elements(N_QUERIES as u64));

    for size in SIZES.iter().copied() {
        // Built once per size and shared by both variants so they measure
        // exactly the same table and access order.
        let table = build_map(size);
        let probes = query_keys(size);

        batch.bench_with_input(BenchmarkId::new("naive", size), &size, |bencher, _| {
            bencher.iter(|| black_box(lookup_naive(black_box(&table), black_box(&probes))));
        });
        batch.bench_with_input(BenchmarkId::new("prefetch", size), &size, |bencher, _| {
            bencher.iter(|| black_box(lookup_prefetched(black_box(&table), black_box(&probes))));
        });
    }

    batch.finish();
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ mod macros;
mod alloc;
mod control;
mod hasher;
mod prefetch;
mod raw;
mod util;

Expand Down
88 changes: 88 additions & 0 deletions src/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,49 @@ where
}
}

/// Hints to the CPU that the table memory a lookup of `k` touches first will
/// be needed soon.
///
/// `k` is hashed with the map's hasher, and a prefetch is issued for the
/// control-byte group at the start of its probe sequence and for the bucket
/// at that position. The call never changes the map and has no observable
/// effect; it is purely a performance hint.
///
/// A prefetch instruction is currently only emitted on x86 / x86-64 targets
/// with SSE enabled. On other targets no hint is issued, although `k` is
/// still passed to the hasher, so the call is not guaranteed to be free.
///
/// This only helps when *many* keys are looked up in sequence and the map is
/// too large for its control bytes to stay in cache. In that situation, call
/// `prefetch` on a key several iterations ahead of the one currently being
/// looked up, so that its cache lines are already in flight when the lookup
/// gets there. For a single lookup, or for a map that fits in cache, it does
/// nothing useful.
///
/// # Examples
///
/// ```
/// use hashbrown::HashMap;
///
/// let map: HashMap<u32, u32> = (0..1000).map(|i| (i, i)).collect();
/// let queries: Vec<u32> = (0..1000).rev().collect();
///
/// let mut sum = 0u64;
/// for (i, q) in queries.iter().enumerate() {
///     if let Some(next) = queries.get(i + 8) {
///         map.prefetch(next);
///     }
///     if let Some(&v) = map.get(q) {
///         sum += u64::from(v);
///     }
/// }
/// # let _ = sum;
/// ```
#[inline]
pub fn prefetch<Q>(&self, k: &Q)
where
    Q: Hash + Equivalent<K> + ?Sized,
{
    // Hash exactly as a lookup would, so the hint targets the same probe
    // start that `get` will read.
    self.table
        .prefetch(make_hash::<Q, S>(&self.hash_builder, k));
}

/// Returns the key-value pair corresponding to the supplied key.
///
/// The supplied key may be any borrowed form of the map's key type, but
Expand Down Expand Up @@ -6899,6 +6942,51 @@ mod test_map {
HashMap::<u32, u32>::with_capacity(1).allocation_size() > core::mem::size_of::<u32>()
);
}

#[test]
fn test_prefetch() {
    // `prefetch` is a hint with no observable effect, so the contract we can
    // test is "calling it never misbehaves and never disturbs the table".
    // Shapes covered: the empty singleton, a table with zero-sized values, a
    // table with zero-sized buckets, a larger table that has grown through
    // several resizes, and both present and absent keys.
    let empty: HashMap<u32, u32> = HashMap::new();
    empty.prefetch(&0);
    empty.prefetch(&12345);

    // Zero-sized *values* only: the buckets are still 4-byte `(u32, ())`
    // pairs, so this goes through the normal data-bucket prefetch.
    let zst: HashMap<u32, ()> = (0..200).map(|i| (i, ())).collect();
    for i in 0..256 {
        zst.prefetch(&i);
    }

    // Zero-sized *buckets*: `((), ())` has size 0, which is the case where
    // the data-bucket prefetch is skipped. Check it both before and after
    // the table has been allocated.
    let mut unit: HashMap<(), ()> = HashMap::new();
    unit.prefetch(&());
    assert_eq!(unit.insert((), ()), None);
    unit.prefetch(&());
    assert_eq!(unit.get(&()), Some(&()));
    assert_eq!(unit.len(), 1);

    let mut map: HashMap<u32, u32> = HashMap::new();
    for i in 0..1000u32 {
        map.insert(i, i.wrapping_mul(7));
    }
    // Half of these keys are present, half are absent.
    for i in 0..2000u32 {
        map.prefetch(&i);
    }
    // The table is still intact and lookups still work after prefetching.
    for i in 0..1000u32 {
        assert_eq!(map.get(&i), Some(&i.wrapping_mul(7)));
    }
    for i in 1000..2000u32 {
        assert_eq!(map.get(&i), None);
    }

    // The look-ahead pattern from the docs.
    let queries: Vec<u32> = (0..1000u32).rev().collect();
    let mut found = 0;
    for (i, &q) in queries.iter().enumerate() {
        if let Some(&next) = queries.get(i + 8) {
            map.prefetch(&next);
        }
        if map.get(&q).is_some() {
            found += 1;
        }
    }
    assert_eq!(found, 1000);
}
}

#[cfg(all(test, unix, any(feature = "nightly", feature = "allocator-api2")))]
Expand Down
54 changes: 54 additions & 0 deletions src/prefetch.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//! Software prefetch hint.
//!
//! A prefetch is a *hint* to the CPU that the cache line containing a given
//! address will be accessed soon, so the memory subsystem can start fetching it
//! while the core does other work. It is purely advisory: it never reads or
//! writes memory, never faults (even for an invalid or dangling pointer), and is
//! a no-op in the Rust abstract machine. Architectures without a stable prefetch
//! intrinsic simply compile it away.
//!
//! `core::intrinsics::prefetch_read_data` is unstable, so we cannot use it here.
//! Instead we use the stable per-architecture intrinsics where they exist
//! (`_mm_prefetch` on x86/x86-64) and fall back to a no-op everywhere else.

/// Issues an L1 read prefetch for the cache line containing `ptr`.
///
/// This is a hint only. `ptr` does not need to be valid, aligned, or even
/// non-null; an out-of-bounds or dangling pointer is fine and will not fault.
///
/// The hint is emitted only on x86 / x86-64 builds with the `sse` target
/// feature enabled and not running under Miri. Everywhere else the function
/// body is empty.
#[inline]
pub(crate) fn prefetch_read_l1(ptr: *const u8) {
    #[cfg(all(
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "sse",
        not(miri),
    ))]
    {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};

        // SAFETY: `_mm_prefetch` is a hint instruction; it performs no memory
        // access, never faults, and accepts any address (the Intel SDM and the
        // `core::arch` docs both spell this out). The only safety requirement is
        // that the `sse` target feature is available, which the `cfg` above
        // guarantees on x86 / x86-64.
        unsafe {
            _mm_prefetch::<_MM_HINT_T0>(ptr.cast::<i8>());
        }
    }

    #[cfg(not(all(
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "sse",
        not(miri),
    )))]
    {
        // No stable prefetch intrinsic on this target (aarch64 has none yet,
        // and `core::intrinsics::prefetch_read_data` is unstable). Make sure
        // `ptr` is still "used" so callers don't trip an unused-variable lint.
        let _ = ptr;
    }
}
61 changes: 61 additions & 0 deletions src/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1210,6 +1210,24 @@ impl<T, A: Allocator> RawTable<T, A> {
}
}

/// Issues a software prefetch hint for the table memory that a lookup of
/// `hash` would touch first: the control-byte group at the start of the
/// probe sequence and the corresponding data bucket.
///
/// This is purely a performance hint and has no observable effect. It is
/// most useful when looking up many keys in a row: hash and prefetch a key a
/// few iterations ahead of the one currently being looked up, so its cache
/// lines are in flight by the time `get`/`find` reaches them. On a single
/// lookup, or on a table small enough to stay in cache, it does nothing
/// useful.
///
/// The hint is only emitted on targets where the crate has a stable prefetch
/// intrinsic to call (currently x86 / x86-64 with SSE); on every other target
/// no prefetch instruction is generated.
#[inline]
pub(crate) fn prefetch(&self, hash: u64) {
// SAFETY: `Self::TABLE_LAYOUT` is the layout of `T`, i.e. the same
// `table_layout` that was used to allocate this table, which is the only
// requirement of the inner `prefetch`.
unsafe { self.table.prefetch(hash, Self::TABLE_LAYOUT) }
}

/// Gets a reference to an element in the table.
#[inline]
pub(crate) fn get(&self, hash: u64, eq: impl FnMut(&T) -> bool) -> Option<&T> {
Expand Down Expand Up @@ -2454,6 +2472,49 @@ impl RawTableInner {
}
}

/// Issues a software prefetch hint for the control-byte group and data
/// bucket at the start of the probe sequence for `hash`.
///
/// `table_layout` must be the layout used to allocate this table (so that
/// the data-bucket address is computed correctly).
///
/// This is a hint only: it performs no memory access, never faults, and is
/// a no-op in the abstract machine. On the empty singleton table the
/// "addresses" point into / just before the shared empty control array,
/// which is fine — prefetching them is still harmless.
///
/// # Safety
///
/// `table_layout` must match the layout used to allocate this table.
/// (The function does not dereference any pointer, but it computes one from
/// `table_layout.size`; a mismatched layout would only mean prefetching the
/// wrong cache line, never UB.)
#[inline]
unsafe fn prefetch(&self, hash: u64, table_layout: TableLayout) {
let pos = h1(hash) & self.bucket_mask;

// Control bytes: the group `Group::load` would read first. `pos` is a
// valid control index (`pos <= bucket_mask < num_ctrl_bytes`), so the
// pointer is in-bounds even before accounting for the hint-only nature
// of prefetch.
let ctrl_ptr = self.ctrl.as_ptr().wrapping_add(pos);

// Data bucket at index `pos`: `data_end - (pos + 1) * size`. `data_end`
// is `self.ctrl`, so this is `self.ctrl - (pos + 1) * size`. Use
// `wrapping_*` so this can never be UB even for the empty singleton
// (where it points just before the shared empty control array).
let data_ptr = self
Comment thread
clarfonthey marked this conversation as resolved.
.ctrl
.as_ptr()
.wrapping_sub((pos + 1).wrapping_mul(table_layout.size));

crate::prefetch::prefetch_read_l1(ctrl_ptr);
// For zero-sized values there is no data array to prefetch.
if table_layout.size != 0 {
crate::prefetch::prefetch_read_l1(data_ptr);
}
}

#[inline]
unsafe fn record_item_insert_at(&mut self, index: usize, old_ctrl: Tag, new_ctrl: Tag) {
self.growth_left -= usize::from(old_ctrl.special_is_empty());
Expand Down
24 changes: 24 additions & 0 deletions src/set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,30 @@ where
self.map.contains_key(value)
}

/// Issues a software prefetch hint for the table memory that a lookup of
/// `value` would touch first.
///
/// This forwards to the underlying map's `prefetch`, which hashes `value`
/// and then prefetches the control-byte group at the start of its probe
/// sequence and the corresponding bucket. It is purely a performance hint
/// with no observable effect.
///
/// A prefetch instruction is currently only emitted on x86 / x86-64 targets
/// with SSE enabled. On other targets no hint is issued, although `value` is
/// still passed to the hasher, so the call is not guaranteed to be free.
///
/// It is only worth using when looking up *many* values in a sequence and
/// the set is large enough that the control bytes do not fit in cache: in
/// that case you can call `prefetch` on a value several iterations ahead of
/// the one currently being looked up. For a single lookup, or a set that
/// fits in cache, it does nothing useful. See [`HashMap::prefetch`] for an
/// example of the look-ahead pattern.
///
/// [`HashMap::prefetch`]: crate::HashMap::prefetch
#[cfg_attr(feature = "inline-more", inline)]
pub fn prefetch<Q>(&self, value: &Q)
where
Q: Hash + Equivalent<T> + ?Sized,
{
self.map.prefetch(value);
}

/// Returns a reference to the value in the set, if any, that is equal to the given value.
///
/// The value may be any borrowed form of the set's value type, but
Expand Down
Loading
Loading