From d42336867bcc197c7ee520fac346ee464f7c4359 Mon Sep 17 00:00:00 2001 From: John Starks Date: Fri, 15 May 2026 22:18:40 +0000 Subject: [PATCH 01/36] vm_topology: add MMIO layout allocator Add a pure-math layout allocator in vm_topology::layout that assigns address ranges to MMIO consumers within a flat physical address space. The allocator has no knowledge of specific architectures, firmware types, or pinned-address conventions -- those are the responsibility of the caller. Consumers call request() to declare allocation needs, passing a &mut MemoryRange that will be filled in when allocate() runs. Pinned requests are placed first at their fixed addresses with overlap validation. Dynamic requests (Below4GiB / Above4GiB) are sorted by alignment desc, size desc, input order asc, then greedy top-down placed into the remaining free space. allocate() returns a sorted Vec of all allocations. Free-space tracking uses subtract_ranges for linear-time updates. Pinned overlap detection sorts by address and checks adjacent pairs. --- vm/vmcore/vm_topology/src/layout.rs | 988 ++++++++++++++++++++++++++++ vm/vmcore/vm_topology/src/lib.rs | 1 + 2 files changed, 989 insertions(+) create mode 100644 vm/vmcore/vm_topology/src/layout.rs diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs new file mode 100644 index 0000000000..99c0f73187 --- /dev/null +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -0,0 +1,988 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! MMIO layout allocator. +//! +//! This module provides a pure-math layout allocator that assigns address +//! ranges to MMIO consumers within a flat physical address space. It has no +//! knowledge of specific architectures, firmware types, or pinned-address +//! conventions — those are the responsibility of the caller (typically +//! `vm_manifest_builder`). +//! +//! # Usage +//! +//! ``` +//! use memory_range::MemoryRange; +//! 
use vm_topology::layout::{Constraint, LayoutBuilder}; +//! +//! let mut reserved = MemoryRange::EMPTY; +//! let mut vmbus = MemoryRange::EMPTY; +//! let mut pcie_bar = MemoryRange::EMPTY; +//! +//! let mut builder = LayoutBuilder::new(48); +//! +//! // Reserve a pinned range for architectural devices. +//! builder.request("reserved", &mut reserved, 32 * 1024 * 1024, 4096, Constraint::Pinned(0xFE00_0000)); +//! +//! // Dynamic allocation below 4 GiB. +//! builder.request("vmbus", &mut vmbus, 128 * 1024 * 1024, 1024 * 1024, Constraint::Below4GiB); +//! +//! // Dynamic allocation above 4 GiB. +//! builder.request( +//! "pcie_bar", +//! &mut pcie_bar, +//! 1024 * 1024 * 1024, +//! 1024 * 1024, +//! Constraint::Above4GiB, +//! ); +//! +//! let sorted = builder.allocate().unwrap(); +//! assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); +//! assert_eq!(sorted.len(), 3); +//! ``` + +use memory_range::MemoryRange; +use memory_range::subtract_ranges; +use thiserror::Error; + +const PAGE_SIZE: u64 = 4096; +const FOUR_GIB: u64 = 0x1_0000_0000; + +/// The constraint on where a layout request can be placed. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Constraint { + /// The allocation must fit entirely below the 4 GiB boundary. + Below4GiB, + /// The allocation must start at or above the 4 GiB boundary. + Above4GiB, + /// The allocation must be placed at exactly the given address. + Pinned(u64), +} + +/// A builder for computing an MMIO layout by collecting requests and +/// then allocating them within a physical address space. +/// +/// The address space is `[0, 1 << physical_address_width)`. Consumers +/// call [`Self::request`] to declare allocation needs (passing a +/// `&mut MemoryRange` that will be filled in), then [`Self::allocate`] +/// to run the greedy placement algorithm. 
+pub struct LayoutBuilder<'a> { + physical_address_width: u8, + targets: Vec<&'a mut MemoryRange>, + requests: Vec<RequestEntry>, +} + +struct RequestEntry { + tag: String, + size: u64, + alignment: u64, + constraint: Constraint, + input_order: usize, +} + +/// Error returned by [`LayoutBuilder::allocate`]. +#[derive(Debug, Error)] +pub enum AllocateError { + /// The physical address width is invalid (must be 1..=63). + #[error("invalid physical address width {0} (must be 1..=63)")] + InvalidAddressWidth(u8), + /// A request has an invalid size (must be > 0 and page-aligned). + #[error("{tag}: invalid size {size:#x} (must be > 0 and a multiple of {PAGE_SIZE:#x})")] + InvalidSize { + /// The tag identifying the request. + tag: String, + /// The invalid size. + size: u64, + }, + /// A request has an invalid alignment. + #[error("{tag}: invalid alignment {alignment:#x} (must be >= {PAGE_SIZE:#x} and a power of 2)")] + InvalidAlignment { + /// The tag identifying the request. + tag: String, + /// The invalid alignment. + alignment: u64, + }, + /// A pinned request has a non-page-aligned address. + #[error("{tag}: pinned address {address:#x} is not page-aligned")] + InvalidPinnedAddress { + /// The tag identifying the request. + tag: String, + /// The invalid address. + address: u64, + }, + /// A pinned request extends beyond the physical address space. + #[error("{tag}: pinned range {address:#x}..{end:#x} exceeds address space limit {limit:#x}")] + PinnedOutOfBounds { + /// The tag identifying the request. + tag: String, + /// The start address. + address: u64, + /// The end address. + end: u64, + /// The address space limit. + limit: u64, + }, + /// Two pinned requests overlap. + #[error("pinned requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] + PinnedOverlap { + /// The tag of the first pinned request. + tag_a: String, + /// The range of the first pinned request. + range_a: MemoryRange, + /// The tag of the second pinned request.
+ tag_b: String, + /// The range of the second pinned request. + range_b: MemoryRange, + }, + /// A dynamic request could not be satisfied. + #[error( + "{tag}: cannot allocate {size:#x} bytes with alignment {alignment:#x} \ + and constraint {constraint:?}; remaining free space in region: {free_space:#x} bytes" + )] + Exhausted { + /// The tag identifying the request. + tag: String, + /// The requested size. + size: u64, + /// The requested alignment. + alignment: u64, + /// The placement constraint. + constraint: Constraint, + /// The remaining free space in the constrained region. + free_space: u64, + }, +} + +impl<'a> LayoutBuilder<'a> { + /// Creates a new layout builder for the given physical address width. + /// + /// The address space is `[0, 1 << physical_address_width)`. + /// `physical_address_width` must be in the range `1..=63`. + pub fn new(physical_address_width: u8) -> Self { + Self { + physical_address_width, + targets: Vec::new(), + requests: Vec::new(), + } + } + + /// Adds a request to the builder. + /// + /// - `tag`: A descriptive name for the request (used in error messages). + /// - `target`: A mutable reference to a [`MemoryRange`] that will be + /// filled in with the allocated range when [`Self::allocate`] is + /// called. + /// - `size`: The size in bytes. Must be > 0 and a multiple of 4096. + /// - `alignment`: The required alignment. Must be >= 4096 and a power + /// of 2. + /// - `constraint`: Where the allocation may be placed. + pub fn request( + &mut self, + tag: impl Into<String>, + target: &'a mut MemoryRange, + size: u64, + alignment: u64, + constraint: Constraint, + ) { + let input_order = self.requests.len(); + self.targets.push(target); + self.requests.push(RequestEntry { + tag: tag.into(), + size, + alignment, + constraint, + input_order, + }); + } + + /// Allocates all requests, fills in each target `&mut MemoryRange`, + /// and returns every allocation sorted by address. + /// + /// The algorithm: + /// 1.
Places all [`Constraint::Pinned`] requests at their fixed + /// addresses, validating no overlaps. + /// 2. Sorts non-pinned requests by `(alignment desc, size desc, + /// input_order asc)`. + /// 3. Greedy top-down placement: for each non-pinned request, finds + /// the highest-address position in the constrained region that + /// satisfies size and alignment. + /// 4. Writes each result to its `&mut MemoryRange` target and + /// returns a `Vec<MemoryRange>` of all allocations sorted by + /// address. + pub fn allocate(mut self) -> Result<Vec<MemoryRange>, AllocateError> { + let width = self.physical_address_width; + if !(1..=63).contains(&width) { + return Err(AllocateError::InvalidAddressWidth(width)); + } + let address_limit = 1u64 << width; + + // Validate all requests up front. + for req in &self.requests { + if req.size == 0 || req.size % PAGE_SIZE != 0 { + return Err(AllocateError::InvalidSize { + tag: req.tag.clone(), + size: req.size, + }); + } + if req.alignment < PAGE_SIZE || !req.alignment.is_power_of_two() { + return Err(AllocateError::InvalidAlignment { + tag: req.tag.clone(), + alignment: req.alignment, + }); + } + if let Constraint::Pinned(addr) = req.constraint { + if addr % PAGE_SIZE != 0 { + return Err(AllocateError::InvalidPinnedAddress { + tag: req.tag.clone(), + address: addr, + }); + } + let end = + addr.checked_add(req.size) + .ok_or_else(|| AllocateError::PinnedOutOfBounds { + tag: req.tag.clone(), + address: addr, + end: u64::MAX, + limit: address_limit, + })?; + if end > address_limit { + return Err(AllocateError::PinnedOutOfBounds { + tag: req.tag.clone(), + address: addr, + end, + limit: address_limit, + }); + } + } + } + + let mut allocations: Vec<MemoryRange> = vec![MemoryRange::EMPTY; self.requests.len()]; + + // Step 1: Collect pinned requests, sort by address, check for + // overlaps with a single adjacent-pair scan.
+ let mut pinned: Vec<(MemoryRange, usize)> = self + .requests + .iter() + .enumerate() + .filter_map(|(i, req)| { + if let Constraint::Pinned(addr) = req.constraint { + Some((MemoryRange::new(addr..addr + req.size), i)) + } else { + None + } + }) + .collect(); + + pinned.sort_by_key(|(range, _)| range.start()); + + for [(range_a, idx_a), (range_b, idx_b)] in pinned.array_windows() { + if range_a.overlaps(range_b) { + return Err(AllocateError::PinnedOverlap { + tag_a: self.requests[*idx_a].tag.clone(), + range_a: *range_a, + tag_b: self.requests[*idx_b].tag.clone(), + range_b: *range_b, + }); + } + } + + for &(range, idx) in &pinned { + allocations[idx] = range; + } + + // Compute free space by subtracting all pinned ranges from the + // full address space in one pass. Both inputs are sorted and + // non-overlapping, so subtract_ranges runs in linear time. + let pinned_ranges: Vec<MemoryRange> = pinned.iter().map(|(r, _)| *r).collect(); + let mut free_ranges: Vec<MemoryRange> = subtract_ranges( + [MemoryRange::new(0..address_limit)], + pinned_ranges.iter().copied(), + ) + .collect(); + + // Step 2: Collect non-Pinned request indices, sort by + // (alignment DESC, size DESC, input_order ASC). + let mut dynamic: Vec<usize> = self + .requests + .iter() + .enumerate() + .filter(|(_, req)| !matches!(req.constraint, Constraint::Pinned(_))) + .map(|(i, _)| i) + .collect(); + + dynamic.sort_by(|&a, &b| { + let ra = &self.requests[a]; + let rb = &self.requests[b]; + rb.alignment + .cmp(&ra.alignment) + .then(rb.size.cmp(&ra.size)) + .then(ra.input_order.cmp(&rb.input_order)) + }); + + // Step 3: Greedy top-down placement. For each dynamic request, + // reverse-scan the sorted free list for the highest-address fit, + // then update the free list via subtract_ranges.
+ for &idx in &dynamic { + let req = &self.requests[idx]; + let (region_start, region_end) = match req.constraint { + Constraint::Below4GiB => (0, FOUR_GIB.min(address_limit)), + Constraint::Above4GiB => (FOUR_GIB, address_limit), + Constraint::Pinned(_) => unreachable!(), + }; + + match find_highest_fit( + &free_ranges, + req.size, + req.alignment, + region_start, + region_end, + ) { + Some(alloc_start) => { + let alloc_range = MemoryRange::new(alloc_start..alloc_start + req.size); + allocations[idx] = alloc_range; + free_ranges = + subtract_ranges(free_ranges.iter().copied(), [alloc_range]).collect(); + } + None => { + let free_in_region: u64 = free_ranges + .iter() + .filter_map(|r| { + let eff_start = r.start().max(region_start); + let eff_end = r.end().min(region_end); + if eff_start < eff_end { + Some(eff_end - eff_start) + } else { + None + } + }) + .sum(); + return Err(AllocateError::Exhausted { + tag: req.tag.clone(), + size: req.size, + alignment: req.alignment, + constraint: req.constraint, + free_space: free_in_region, + }); + } + } + } + + // Step 4: Write results to targets and build sorted output. + for (target, alloc) in self.targets.iter_mut().zip(allocations.iter()) { + **target = *alloc; + } + + allocations.sort(); + Ok(allocations) + } +} + +/// Finds the highest aligned start address within `[region_start, region_end)` +/// that fits `size` bytes within one of the free ranges. +/// +/// The free list must be sorted by address. Iterates in reverse to find +/// the highest-address match first. +fn find_highest_fit( + free_ranges: &[MemoryRange], + size: u64, + alignment: u64, + region_start: u64, + region_end: u64, +) -> Option<u64> { + for range in free_ranges.iter().rev() { + // Clip the free range to the constrained region.
+ let eff_start = range.start().max(region_start); + let eff_end = range.end().min(region_end); + + if eff_start >= eff_end || eff_end - eff_start < size { + continue; + } + + // Find the highest aligned start where [start, start + size) fits. + let latest_start = eff_end - size; + let aligned_start = latest_start & !(alignment - 1); + + if aligned_start >= eff_start { + return Some(aligned_start); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + const KIB: u64 = 1024; + const MIB: u64 = 1024 * KIB; + const GIB: u64 = 1024 * MIB; + + #[test] + fn empty_input() { + let builder = LayoutBuilder::new(48); + let sorted = builder.allocate().unwrap(); + assert!(sorted.is_empty()); + } + + #[test] + fn single_pinned() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request( + "test", + &mut target, + 4 * MIB, + PAGE_SIZE, + Constraint::Pinned(0xFC00_0000), + ); + let sorted = builder.allocate().unwrap(); + assert_eq!(target, MemoryRange::new(0xFC00_0000..0xFC00_0000 + 4 * MIB)); + assert_eq!(sorted.len(), 1); + } + + #[test] + fn multiple_pinned_non_overlapping() { + let mut t1 = MemoryRange::EMPTY; + let mut t2 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("a", &mut t1, 4 * KIB, PAGE_SIZE, Constraint::Pinned(0x1000)); + builder.request("b", &mut t2, 4 * KIB, PAGE_SIZE, Constraint::Pinned(0x2000)); + let sorted = builder.allocate().unwrap(); + assert_eq!(t1, MemoryRange::new(0x1000..0x2000)); + assert_eq!(t2, MemoryRange::new(0x2000..0x3000)); + assert_eq!(sorted[0], MemoryRange::new(0x1000..0x2000)); + assert_eq!(sorted[1], MemoryRange::new(0x2000..0x3000)); + } + + #[test] + fn pinned_overlap_rejected() { + let mut t1 = MemoryRange::EMPTY; + let mut t2 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("a", &mut t1, 8 * KIB, PAGE_SIZE, Constraint::Pinned(0x1000)); + builder.request("b", &mut t2, 4 * KIB, PAGE_SIZE, 
Constraint::Pinned(0x2000)); + let err = builder.allocate().unwrap_err(); + assert!(matches!(err, AllocateError::PinnedOverlap { .. })); + } + + #[test] + fn pinned_out_of_bounds() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(32); + builder.request( + "oob", + &mut target, + 8 * KIB, + PAGE_SIZE, + Constraint::Pinned(0xFFFF_F000), + ); + let err = builder.allocate().unwrap_err(); + assert!(matches!(err, AllocateError::PinnedOutOfBounds { .. })); + } + + #[test] + fn pinned_at_edge_of_address_space() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(32); + builder.request( + "edge", + &mut target, + 4 * KIB, + PAGE_SIZE, + Constraint::Pinned(0xFFFF_F000), + ); + builder.allocate().unwrap(); + assert_eq!(target, MemoryRange::new(0xFFFF_F000..0x1_0000_0000)); + } + + #[test] + fn pinned_at_address_zero() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request( + "zero", + &mut target, + 4 * KIB, + PAGE_SIZE, + Constraint::Pinned(0), + ); + builder.allocate().unwrap(); + assert_eq!(target, MemoryRange::new(0..PAGE_SIZE)); + } + + #[test] + fn single_below_4gib() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("test", &mut target, MIB, MIB, Constraint::Below4GiB); + builder.allocate().unwrap(); + assert!(target.end() <= FOUR_GIB); + assert_eq!(target.len(), MIB); + assert_eq!(target.start() % MIB, 0); + } + + #[test] + fn single_above_4gib() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("test", &mut target, GIB, MIB, Constraint::Above4GiB); + builder.allocate().unwrap(); + assert!(target.start() >= FOUR_GIB); + assert_eq!(target.len(), GIB); + assert_eq!(target.start() % MIB, 0); + } + + #[test] + fn below_4gib_top_down_placement() { + let mut t1 = MemoryRange::EMPTY; + let mut t2 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + 
builder.request("a", &mut t1, MIB, MIB, Constraint::Below4GiB); + builder.request("b", &mut t2, MIB, MIB, Constraint::Below4GiB); + builder.allocate().unwrap(); + // Same alignment and size → input order tiebreaker. t1 (order 0) + // is placed first (highest address), t2 (order 1) gets the next + // highest. + assert!(t1.start() > t2.start()); + assert!(!t1.overlaps(&t2)); + } + + #[test] + fn alignment_driven_ordering() { + let mut small = MemoryRange::EMPTY; + let mut big = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("small", &mut small, MIB, MIB, Constraint::Below4GiB); + builder.request("big", &mut big, MIB, 256 * MIB, Constraint::Below4GiB); + builder.allocate().unwrap(); + assert_eq!(big.start() % (256 * MIB), 0); + assert_eq!(small.start() % MIB, 0); + assert!(!big.overlaps(&small)); + assert!(big.end() <= FOUR_GIB); + assert!(small.end() <= FOUR_GIB); + } + + #[test] + fn size_driven_ordering_same_alignment() { + let mut small = MemoryRange::EMPTY; + let mut big = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("small", &mut small, MIB, MIB, Constraint::Below4GiB); + builder.request("big", &mut big, 4 * MIB, MIB, Constraint::Below4GiB); + builder.allocate().unwrap(); + assert!(big.start() > small.start()); + } + + #[test] + fn pinned_plus_dynamic() { + let mut reserved = MemoryRange::EMPTY; + let mut dynamic = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(32); + builder.request( + "reserved", + &mut reserved, + 32 * MIB, + PAGE_SIZE, + Constraint::Pinned(0xFE00_0000), + ); + builder.request("dynamic", &mut dynamic, MIB, MIB, Constraint::Below4GiB); + builder.allocate().unwrap(); + assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); + assert!(!dynamic.overlaps(&reserved)); + assert!(dynamic.end() <= FOUR_GIB); + } + + #[test] + fn exhaustion_below_4gib() { + let mut t1 = MemoryRange::EMPTY; + let mut t2 = MemoryRange::EMPTY; + let mut builder = 
LayoutBuilder::new(32); + builder.request("pin", &mut t1, GIB, PAGE_SIZE, Constraint::Pinned(0)); + builder.request( + "too_big", + &mut t2, + 4 * GIB, + PAGE_SIZE, + Constraint::Below4GiB, + ); + let err = builder.allocate().unwrap_err(); + assert!(matches!(err, AllocateError::Exhausted { .. })); + } + + #[test] + fn exhaustion_above_4gib_narrow_width() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(32); + builder.request( + "above", + &mut target, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Above4GiB, + ); + let err = builder.allocate().unwrap_err(); + assert!(matches!(err, AllocateError::Exhausted { .. })); + } + + #[test] + fn exhaustion_alignment_fragmentation() { + let mut t1 = MemoryRange::EMPTY; + let mut t2 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(36); + builder.request( + "pin", + &mut t1, + 0xF800_0000, + PAGE_SIZE, + Constraint::Pinned(0), + ); + builder.request( + "misaligned", + &mut t2, + 128 * MIB, + 256 * MIB, + Constraint::Below4GiB, + ); + let err = builder.allocate().unwrap_err(); + assert!(matches!(err, AllocateError::Exhausted { .. 
})); + } + + #[test] + fn below_4gib_above_4gib_filtering() { + let mut below = MemoryRange::EMPTY; + let mut above = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("below", &mut below, MIB, MIB, Constraint::Below4GiB); + builder.request("above", &mut above, MIB, MIB, Constraint::Above4GiB); + builder.allocate().unwrap(); + assert!(below.end() <= FOUR_GIB); + assert!(above.start() >= FOUR_GIB); + } + + #[test] + fn sorted_ranges_order() { + let mut t_above = MemoryRange::EMPTY; + let mut t_pinned = MemoryRange::EMPTY; + let mut t_below = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("above", &mut t_above, GIB, MIB, Constraint::Above4GiB); + builder.request( + "pinned", + &mut t_pinned, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Pinned(0x1000), + ); + builder.request("below", &mut t_below, MIB, MIB, Constraint::Below4GiB); + let sorted = builder.allocate().unwrap(); + assert_eq!(sorted.len(), 3); + // Pinned at 0x1000 should be first. 
+ assert_eq!(sorted[0], t_pinned); + for [a, b] in sorted.array_windows() { + assert!(a.start() < b.start()); + } + } + + #[test] + fn determinism() { + let mut prev_sorted: Option<Vec<MemoryRange>> = None; + for _ in 0..10 { + let mut a = MemoryRange::EMPTY; + let mut b = MemoryRange::EMPTY; + let mut c = MemoryRange::EMPTY; + let mut d = MemoryRange::EMPTY; + let mut e = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("a", &mut a, 4 * MIB, MIB, Constraint::Below4GiB); + builder.request("b", &mut b, GIB, 256 * MIB, Constraint::Above4GiB); + builder.request( + "c", + &mut c, + 32 * MIB, + PAGE_SIZE, + Constraint::Pinned(0xFE00_0000), + ); + builder.request("d", &mut d, 128 * MIB, MIB, Constraint::Below4GiB); + builder.request("e", &mut e, PAGE_SIZE, PAGE_SIZE, Constraint::Below4GiB); + let sorted = builder.allocate().unwrap(); + if let Some(prev) = &prev_sorted { + assert_eq!(prev, &sorted); + } + prev_sorted = Some(sorted); + } + } + + #[test] + fn invalid_address_width() { + let builder = LayoutBuilder::new(0); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidAddressWidth(0) + )); + let builder = LayoutBuilder::new(64); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidAddressWidth(64) + )); + } + + #[test] + fn invalid_size() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("bad", &mut target, 0, PAGE_SIZE, Constraint::Below4GiB); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidSize { .. } + )); + + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("bad", &mut target, 100, PAGE_SIZE, Constraint::Below4GiB); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidSize { ..
} + )); + } + + #[test] + fn invalid_alignment() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("bad", &mut target, PAGE_SIZE, 1024, Constraint::Below4GiB); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidAlignment { .. } + )); + + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request( + "bad", + &mut target, + PAGE_SIZE, + 6 * KIB, + Constraint::Below4GiB, + ); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidAlignment { .. } + )); + } + + #[test] + fn invalid_pinned_address() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request( + "bad", + &mut target, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Pinned(0x1234), + ); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidPinnedAddress { .. } + )); + } + + #[test] + fn realistic_x86_layout() { + let mut reserved = MemoryRange::EMPTY; + let mut vmbus_low = MemoryRange::EMPTY; + let mut vmbus_high = MemoryRange::EMPTY; + let mut pcie_ecam = MemoryRange::EMPTY; + let mut pcie_low = MemoryRange::EMPTY; + let mut pcie_high = MemoryRange::EMPTY; + let mut virtio = [MemoryRange::EMPTY; 4]; + + let mut builder = LayoutBuilder::new(48); + builder.request( + "reserved", + &mut reserved, + 32 * MIB, + PAGE_SIZE, + Constraint::Pinned(0xFE00_0000), + ); + builder.request( + "vmbus_low", + &mut vmbus_low, + 128 * MIB, + MIB, + Constraint::Below4GiB, + ); + builder.request( + "vmbus_high", + &mut vmbus_high, + GIB, + MIB, + Constraint::Above4GiB, + ); + builder.request( + "pcie_ecam", + &mut pcie_ecam, + 256 * MIB, + 256 * MIB, + Constraint::Below4GiB, + ); + builder.request( + "pcie_low", + &mut pcie_low, + 64 * MIB, + MIB, + Constraint::Below4GiB, + ); + builder.request("pcie_high", &mut pcie_high, GIB, MIB, Constraint::Above4GiB); + for (i, v) in virtio.iter_mut().enumerate() { + builder.request( + 
format!("virtio_{i}"), + v, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Below4GiB, + ); + } + + let sorted = builder.allocate().unwrap(); + + assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); + assert!(vmbus_low.end() <= FOUR_GIB); + assert!(pcie_ecam.end() <= FOUR_GIB); + assert!(pcie_low.end() <= FOUR_GIB); + for v in &virtio { + assert!(v.end() <= FOUR_GIB); + } + assert!(vmbus_high.start() >= FOUR_GIB); + assert!(pcie_high.start() >= FOUR_GIB); + + for [a, b] in sorted.array_windows() { + assert!(a.end() <= b.start(), "overlap: {} and {}", a, b); + } + + assert_eq!(pcie_ecam.start() % (256 * MIB), 0); + + for r in &sorted { + assert!(r.end() <= 1u64 << 48); + } + } + + #[test] + fn realistic_aarch64_layout() { + let mut reserved = MemoryRange::EMPTY; + let mut vmbus_low = MemoryRange::EMPTY; + let mut vmbus_high = MemoryRange::EMPTY; + + let mut builder = LayoutBuilder::new(48); + builder.request( + "reserved", + &mut reserved, + 272 * MIB, + PAGE_SIZE, + Constraint::Pinned(0xEF00_0000), + ); + builder.request( + "vmbus_low", + &mut vmbus_low, + 128 * MIB, + MIB, + Constraint::Below4GiB, + ); + builder.request( + "vmbus_high", + &mut vmbus_high, + GIB, + MIB, + Constraint::Above4GiB, + ); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(reserved, MemoryRange::new(0xEF00_0000..0x1_0000_0000)); + assert!(vmbus_low.end() <= FOUR_GIB); + assert!(vmbus_high.start() >= FOUR_GIB); + + for [a, b] in sorted.array_windows() { + assert!(a.end() <= b.start()); + } + } + + #[test] + fn pinned_at_top_of_space() { + let mut target = MemoryRange::EMPTY; + let width: u8 = 36; + let limit = 1u64 << width; + let mut builder = LayoutBuilder::new(width); + builder.request( + "top", + &mut target, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Pinned(limit - PAGE_SIZE), + ); + builder.allocate().unwrap(); + assert_eq!(target, MemoryRange::new((limit - PAGE_SIZE)..limit)); + } + + #[test] + fn many_small_allocations() { + let mut targets = [MemoryRange::EMPTY; 
100]; + let mut builder = LayoutBuilder::new(48); + for (i, t) in targets.iter_mut().enumerate() { + builder.request( + format!("s{i}"), + t, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Below4GiB, + ); + } + let sorted = builder.allocate().unwrap(); + assert_eq!(sorted.len(), 100); + for [a, b] in sorted.array_windows() { + assert!(a.end() <= b.start()); + } + for r in &sorted { + assert!(r.end() <= FOUR_GIB); + } + } + + #[test] + fn mixed_constraints_with_pinned() { + let mut p1 = MemoryRange::EMPTY; + let mut p2 = MemoryRange::EMPTY; + let mut d1 = MemoryRange::EMPTY; + let mut d2 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(48); + builder.request("p1", &mut p1, GIB, PAGE_SIZE, Constraint::Pinned(GIB)); + builder.request("p2", &mut p2, GIB, PAGE_SIZE, Constraint::Pinned(3 * GIB)); + builder.request("d1", &mut d1, 512 * MIB, MIB, Constraint::Below4GiB); + builder.request("d2", &mut d2, 512 * MIB, MIB, Constraint::Below4GiB); + builder.allocate().unwrap(); + + assert_eq!(p1, MemoryRange::new(GIB..2 * GIB)); + assert_eq!(p2, MemoryRange::new(3 * GIB..4 * GIB)); + + assert!(!d1.overlaps(&d2)); + assert!(!d1.overlaps(&p1)); + assert!(!d1.overlaps(&p2)); + assert!(!d2.overlaps(&p1)); + assert!(!d2.overlaps(&p2)); + } + + #[test] + fn narrow_address_space() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(20); + builder.request( + "test", + &mut target, + PAGE_SIZE, + PAGE_SIZE, + Constraint::Below4GiB, + ); + builder.allocate().unwrap(); + assert!(target.end() <= 1u64 << 20); + } +} diff --git a/vm/vmcore/vm_topology/src/lib.rs b/vm/vmcore/vm_topology/src/lib.rs index c9649cca18..4d9a163375 100644 --- a/vm/vmcore/vm_topology/src/lib.rs +++ b/vm/vmcore/vm_topology/src/lib.rs @@ -6,6 +6,7 @@ #![forbid(unsafe_code)] +pub mod layout; pub mod memory; pub mod pcie; pub mod processor; From 888a96b4b2d03b829b2721bc5ed32cd188e83b92 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 00:50:23 +0000 Subject: [PATCH 02/36] 
vm_topology: rework layout allocator around RAM --- vm/vmcore/vm_topology/src/layout.rs | 1560 ++++++++++++++------------- 1 file changed, 836 insertions(+), 724 deletions(-) diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index 99c0f73187..e16e009c4e 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -1,92 +1,369 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -//! MMIO layout allocator. +//! VM address-space layout allocator. //! -//! This module provides a pure-math layout allocator that assigns address -//! ranges to MMIO consumers within a flat physical address space. It has no -//! knowledge of specific architectures, firmware types, or pinned-address -//! conventions — those are the responsibility of the caller (typically -//! `vm_manifest_builder`). +//! This module provides a pure-math layout allocator that places fixed ranges, +//! 32-bit MMIO, ordinary RAM, and 64-bit MMIO in a flat guest physical address +//! map. It has no knowledge of specific architectures, firmware types, or +//! chipset conventions; callers express those policies as fixed ranges and +//! dynamic requests. //! //! # Usage //! //! ``` //! use memory_range::MemoryRange; -//! use vm_topology::layout::{Constraint, LayoutBuilder}; +//! use vm_topology::layout::{LayoutBuilder, Placement}; //! //! let mut reserved = MemoryRange::EMPTY; +//! let mut ram = Vec::new(); //! let mut vmbus = MemoryRange::EMPTY; -//! let mut pcie_bar = MemoryRange::EMPTY; //! -//! let mut builder = LayoutBuilder::new(48); -//! -//! // Reserve a pinned range for architectural devices. -//! builder.request("reserved", &mut reserved, 32 * 1024 * 1024, 4096, Constraint::Pinned(0xFE00_0000)); -//! -//! // Dynamic allocation below 4 GiB. -//! builder.request("vmbus", &mut vmbus, 128 * 1024 * 1024, 1024 * 1024, Constraint::Below4GiB); -//! -//! // Dynamic allocation above 4 GiB. +//! 
let mut builder = LayoutBuilder::new(); //! builder.request( -//! "pcie_bar", -//! &mut pcie_bar, -//! 1024 * 1024 * 1024, +//! "reserved", +//! &mut reserved, +//! 32 * 1024 * 1024, +//! 4096, +//! Placement::Fixed(0xFE00_0000), +//! ); +//! builder.request( +//! "vmbus", +//! &mut vmbus, +//! 128 * 1024 * 1024, //! 1024 * 1024, -//! Constraint::Above4GiB, +//! Placement::Mmio32, //! ); +//! builder.ram("ram", &mut ram, 2 * 1024 * 1024 * 1024, 4096); //! //! let sorted = builder.allocate().unwrap(); //! assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); +//! assert_eq!(ram, [MemoryRange::new(0..0x8000_0000)]); +//! assert_eq!(vmbus.end(), 0xFE00_0000); //! assert_eq!(sorted.len(), 3); //! ``` use memory_range::MemoryRange; -use memory_range::subtract_ranges; use thiserror::Error; const PAGE_SIZE: u64 = 4096; const FOUR_GIB: u64 = 0x1_0000_0000; +const ADDRESS_LIMIT: u64 = MemoryRange::MAX_ADDRESS; + +/// The placement class for a single-range layout request. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Placement { + /// The allocation must be placed exactly at the given address. + Fixed(u64), + /// The allocation must fit below the 4 GiB boundary and is placed top down. + Mmio32, + /// The allocation is placed bottom up from the end of RAM. + Mmio64, +} + +/// The kind of a produced allocation. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PlacedRangeKind { + /// A fixed allocation supplied by the caller. + Fixed, + /// A 32-bit MMIO allocation. + Mmio32, + /// An ordinary RAM allocation. + Ram, + /// A 64-bit MMIO allocation. + Mmio64, +} -/// The constraint on where a layout request can be placed. +/// Allocation phase reported in [`AllocateError::Exhausted`]. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Constraint { - /// The allocation must fit entirely below the 4 GiB boundary. - Below4GiB, - /// The allocation must start at or above the 4 GiB boundary. 
- Above4GiB, - /// The allocation must be placed at exactly the given address. - Pinned(u64), +pub enum AllocationPhase { + /// 32-bit MMIO placement. + Mmio32, + /// RAM placement. + Ram, + /// 64-bit MMIO placement. + Mmio64, } -/// A builder for computing an MMIO layout by collecting requests and -/// then allocating them within a physical address space. -/// -/// The address space is `[0, 1 << physical_address_width)`. Consumers -/// call [`Self::request`] to declare allocation needs (passing a -/// `&mut MemoryRange` that will be filled in), then [`Self::allocate`] -/// to run the greedy placement algorithm. +/// A placed range returned by [`LayoutBuilder::allocate`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PlacedRange { + /// The caller-supplied tag for the request. + pub tag: String, + /// The kind of allocation. + pub kind: PlacedRangeKind, + /// The placed range. + pub range: MemoryRange, +} + +/// A builder for computing a deterministic VM address-space layout. pub struct LayoutBuilder<'a> { - physical_address_width: u8, - targets: Vec<&'a mut MemoryRange>, - requests: Vec, + fixed: Vec>, + mmio32: Vec>, + ram: Vec>, + mmio64: Vec>, } -struct RequestEntry { +struct FixedRequest<'a> { tag: String, + target: &'a mut MemoryRange, + base: u64, size: u64, alignment: u64, - constraint: Constraint, - input_order: usize, +} + +struct DynamicRequest<'a> { + tag: String, + target: &'a mut MemoryRange, + size: u64, + alignment: u64, +} + +struct RamRequest<'a> { + tag: String, + target: &'a mut Vec, + size: u64, + alignment: u64, +} + +trait RequestDetails { + fn tag(&self) -> &str; + fn size(&self) -> u64; + fn alignment(&self) -> u64; +} + +impl RequestDetails for DynamicRequest<'_> { + fn tag(&self) -> &str { + &self.tag + } + + fn size(&self) -> u64 { + self.size + } + + fn alignment(&self) -> u64 { + self.alignment + } +} + +impl RequestDetails for RamRequest<'_> { + fn tag(&self) -> &str { + &self.tag + } + + fn size(&self) -> u64 { + self.size + } 
+ + fn alignment(&self) -> u64 { + self.alignment + } +} + +struct AllocationState { + // Sorted, non-overlapping ranges not yet consumed by any request. Keeping + // free space as the primary state lets each phase update the map + // incrementally instead of repeatedly subtracting all allocations from the + // whole address space. + free: Vec, + allocations: Vec, + // Highest end address of ordinary RAM. High MMIO starts here so the layout + // top is driven by requested topology rather than a caller-provided high + // MMIO bucket size or host physical-address width. + ram_end: u64, +} + +impl AllocationState { + fn new() -> Self { + Self { + free: vec![MemoryRange::new(0..ADDRESS_LIMIT)], + allocations: Vec::new(), + ram_end: 0, + } + } + + fn place_fixed(&mut self, requests: &mut [FixedRequest<'_>]) -> Result<(), AllocateError> { + // Fixed ranges represent policy decisions made by the caller: reserved + // architectural/chipset zones, firmware conventions, and any other + // pinned addresses. They seed the free list before dynamic placement; + // this layer does not assign special meaning to particular fixed tags. 
+ let mut fixed = requests + .iter() + .enumerate() + .map(|(index, request)| { + ( + MemoryRange::new(request.base..request.base + request.size), + index, + ) + }) + .collect::>(); + + fixed.sort_by_key(|(range, _)| range.start()); + + for pair in fixed.windows(2) { + let (range_a, index_a) = pair[0]; + let (range_b, index_b) = pair[1]; + if range_a.overlaps(&range_b) { + return Err(AllocateError::FixedOverlap { + tag_a: requests[index_a].tag.clone(), + range_a, + tag_b: requests[index_b].tag.clone(), + range_b, + }); + } + } + + for &(range, request_index) in &fixed { + *requests[request_index].target = range; + self.allocate_range(&requests[request_index].tag, PlacedRangeKind::Fixed, range); + } + + Ok(()) + } + + fn place_mmio32(&mut self, requests: &mut [DynamicRequest<'_>]) -> Result<(), AllocateError> { + // Pack 32-bit MMIO from the top of the 4 GiB window downward so RAM can + // start at GPA 0 and grow upward through the lowest remaining space. + // Alignment/size ordering keeps large, constrained windows from being + // fragmented by small devices. `sort_by` is stable, so otherwise equal + // requests keep caller order. + requests.sort_by(|request, other_request| { + other_request + .alignment + .cmp(&request.alignment) + .then(other_request.size.cmp(&request.size)) + }); + + for request in requests { + let Some(start) = + find_highest_fit(&self.free, request.size, request.alignment, 0, FOUR_GIB) + else { + return Err(exhausted_error( + request, + AllocationPhase::Mmio32, + &self.free, + 0, + FOUR_GIB, + )); + }; + + let range = MemoryRange::new(start..start + request.size); + *request.target = range; + self.allocate_range(&request.tag, PlacedRangeKind::Mmio32, range); + } + + Ok(()) + } + + fn place_ram(&mut self, requests: &mut [RamRequest<'_>]) -> Result<(), AllocateError> { + // Ordinary RAM is the only splittable request type in this API. 
It is + // placed after low MMIO so the resulting RAM extents describe the + // actual guest-visible memory map, including holes below 4 GiB. + for request in requests { + let ranges = find_lowest_splittable_fit( + &self.free, + request.size, + request.alignment, + 0, + ADDRESS_LIMIT, + ) + .ok_or_else(|| { + exhausted_error(request, AllocationPhase::Ram, &self.free, 0, ADDRESS_LIMIT) + })?; + + request.target.clear(); + request.target.extend_from_slice(&ranges); + for range in ranges { + self.allocate_range(&request.tag, PlacedRangeKind::Ram, range); + } + } + + Ok(()) + } + + fn place_mmio64(&mut self, requests: &mut [DynamicRequest<'_>]) -> Result<(), AllocateError> { + // High MMIO is allocated bottom up from the end of RAM. The allocator + // intentionally does not take host physical-address width as an input; + // callers validate the resulting top against host capabilities later. + requests.sort_by(|request, other_request| { + other_request + .alignment + .cmp(&request.alignment) + .then(other_request.size.cmp(&request.size)) + }); + + for request in requests { + let Some(start) = find_lowest_fit( + &self.free, + request.size, + request.alignment, + self.ram_end, + ADDRESS_LIMIT, + ) else { + return Err(exhausted_error( + request, + AllocationPhase::Mmio64, + &self.free, + self.ram_end, + ADDRESS_LIMIT, + )); + }; + + let range = MemoryRange::new(start..start + request.size); + *request.target = range; + self.allocate_range(&request.tag, PlacedRangeKind::Mmio64, range); + } + + Ok(()) + } + + fn record(&mut self, tag: &str, kind: PlacedRangeKind, range: MemoryRange) { + self.allocations.push(PlacedRange { + tag: tag.to_string(), + kind, + range, + }); + + if kind == PlacedRangeKind::Ram { + self.ram_end = self.ram_end.max(range.end()); + } + } + + fn allocate_range(&mut self, tag: &str, kind: PlacedRangeKind, range: MemoryRange) { + self.remove_free_range(range); + self.record(tag, kind, range); + } + + fn remove_free_range(&mut self, allocated: MemoryRange) 
{ + let free_index = self + .free + .partition_point(|range| range.start() <= allocated.start()) + .checked_sub(1) + .expect("allocated range must be contained in the free list"); + assert!(self.free[free_index].contains(&allocated)); + let free_range = self.free.remove(free_index); + + let mut insert_index = free_index; + if free_range.start() < allocated.start() { + self.free.insert( + insert_index, + MemoryRange::new(free_range.start()..allocated.start()), + ); + insert_index += 1; + } + if allocated.end() < free_range.end() { + self.free.insert( + insert_index, + MemoryRange::new(allocated.end()..free_range.end()), + ); + } + } } /// Error returned by [`LayoutBuilder::allocate`]. #[derive(Debug, Error)] pub enum AllocateError { - /// The physical address width is invalid (must be 1..=63). - #[error("invalid physical address width {0} (must be 1..=63)")] - InvalidAddressWidth(u8), - /// A request has an invalid size (must be > 0 and page-aligned). + /// A request has an invalid size. #[error("{tag}: invalid size {size:#x} (must be > 0 and a multiple of {PAGE_SIZE:#x})")] InvalidSize { /// The tag identifying the request. @@ -102,42 +379,41 @@ pub enum AllocateError { /// The invalid alignment. alignment: u64, }, - /// A pinned request has a non-page-aligned address. - #[error("{tag}: pinned address {address:#x} is not page-aligned")] - InvalidPinnedAddress { + /// A fixed request has a non-page-aligned address. + #[error("{tag}: fixed address {address:#x} is not page-aligned")] + InvalidFixedAddress { /// The tag identifying the request. tag: String, /// The invalid address. address: u64, }, - /// A pinned request extends beyond the physical address space. - #[error("{tag}: pinned range {address:#x}..{end:#x} exceeds address space limit {limit:#x}")] - PinnedOutOfBounds { + /// A fixed request's range cannot be represented. 
+ #[error( + "{tag}: fixed range starting at {address:#x} with size {size:#x} exceeds the address space" + )] + FixedRangeOverflow { /// The tag identifying the request. tag: String, /// The start address. address: u64, - /// The end address. - end: u64, - /// The address space limit. - limit: u64, + /// The requested size. + size: u64, }, - /// Two pinned requests overlap. - #[error("pinned requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] - PinnedOverlap { - /// The tag of the first pinned request. + /// Two fixed requests overlap. + #[error("fixed requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] + FixedOverlap { + /// The tag of the first fixed request. tag_a: String, - /// The range of the first pinned request. + /// The range of the first fixed request. range_a: MemoryRange, - /// The tag of the second pinned request. + /// The tag of the second fixed request. tag_b: String, - /// The range of the second pinned request. + /// The range of the second fixed request. range_b: MemoryRange, }, /// A dynamic request could not be satisfied. #[error( - "{tag}: cannot allocate {size:#x} bytes with alignment {alignment:#x} \ - and constraint {constraint:?}; remaining free space in region: {free_space:#x} bytes" + "{tag}: cannot allocate {size:#x} bytes with alignment {alignment:#x} during {phase:?}; remaining free space in phase: {free_space:#x} bytes" )] Exhausted { /// The tag identifying the request. @@ -146,242 +422,194 @@ pub enum AllocateError { size: u64, /// The requested alignment. alignment: u64, - /// The placement constraint. - constraint: Constraint, - /// The remaining free space in the constrained region. + /// The allocation phase. + phase: AllocationPhase, + /// The remaining free space in the phase. free_space: u64, }, } impl<'a> LayoutBuilder<'a> { - /// Creates a new layout builder for the given physical address width. - /// - /// The address space is `[0, 1 << physical_address_width)`. 
- /// `physical_address_width` must be in the range `1..=63`. - pub fn new(physical_address_width: u8) -> Self { + /// Creates a new layout builder. + pub fn new() -> Self { Self { - physical_address_width, - targets: Vec::new(), - requests: Vec::new(), + fixed: Vec::new(), + mmio32: Vec::new(), + ram: Vec::new(), + mmio64: Vec::new(), } } - /// Adds a request to the builder. + /// Adds a single-range request to the builder. /// - /// - `tag`: A descriptive name for the request (used in error messages). - /// - `target`: A mutable reference to a [`MemoryRange`] that will be - /// filled in with the allocated range when [`Self::allocate`] is - /// called. - /// - `size`: The size in bytes. Must be > 0 and a multiple of 4096. - /// - `alignment`: The required alignment. Must be >= 4096 and a power - /// of 2. - /// - `constraint`: Where the allocation may be placed. + /// The target is filled in when [`Self::allocate`] succeeds. pub fn request( &mut self, tag: impl Into, target: &'a mut MemoryRange, size: u64, alignment: u64, - constraint: Constraint, + placement: Placement, ) { - let input_order = self.requests.len(); - self.targets.push(target); - self.requests.push(RequestEntry { + match placement { + Placement::Fixed(base) => self.fixed.push(FixedRequest { + tag: tag.into(), + target, + base, + size, + alignment, + }), + Placement::Mmio32 => self.mmio32.push(DynamicRequest { + tag: tag.into(), + target, + size, + alignment, + }), + Placement::Mmio64 => self.mmio64.push(DynamicRequest { + tag: tag.into(), + target, + size, + alignment, + }), + } + } + + /// Adds an ordinary RAM request to the builder. + /// + /// RAM is placed bottom up from GPA 0 and may split around fixed and MMIO32 + /// ranges. The target vector is replaced with the placed RAM extents when + /// [`Self::allocate`] succeeds. 
+ pub fn ram( + &mut self, + tag: impl Into, + target: &'a mut Vec, + size: u64, + alignment: u64, + ) { + self.ram.push(RamRequest { tag: tag.into(), + target, size, alignment, - constraint, - input_order, }); } - /// Allocates all requests, fills in each target `&mut MemoryRange`, - /// and returns every allocation sorted by address. - /// - /// The algorithm: - /// 1. Places all [`Constraint::Pinned`] requests at their fixed - /// addresses, validating no overlaps. - /// 2. Sorts non-pinned requests by `(alignment desc, size desc, - /// input_order asc)`. - /// 3. Greedy top-down placement: for each non-pinned request, finds - /// the highest-address position in the constrained region that - /// satisfies size and alignment. - /// 4. Writes each result to its `&mut MemoryRange` target and - /// returns a `Vec` of all allocations sorted by - /// address. - pub fn allocate(mut self) -> Result, AllocateError> { - let width = self.physical_address_width; - if !(1..=63).contains(&width) { - return Err(AllocateError::InvalidAddressWidth(width)); - } - let address_limit = 1u64 << width; - - // Validate all requests up front. 
- for req in &self.requests { - if req.size == 0 || req.size % PAGE_SIZE != 0 { - return Err(AllocateError::InvalidSize { - tag: req.tag.clone(), - size: req.size, - }); - } - if req.alignment < PAGE_SIZE || !req.alignment.is_power_of_two() { - return Err(AllocateError::InvalidAlignment { - tag: req.tag.clone(), - alignment: req.alignment, - }); - } - if let Constraint::Pinned(addr) = req.constraint { - if addr % PAGE_SIZE != 0 { - return Err(AllocateError::InvalidPinnedAddress { - tag: req.tag.clone(), - address: addr, - }); - } - let end = - addr.checked_add(req.size) - .ok_or_else(|| AllocateError::PinnedOutOfBounds { - tag: req.tag.clone(), - address: addr, - end: u64::MAX, - limit: address_limit, - })?; - if end > address_limit { - return Err(AllocateError::PinnedOutOfBounds { - tag: req.tag.clone(), - address: addr, - end, - limit: address_limit, - }); - } - } - } + /// Allocates all requests, fills in each target, and returns every placed + /// range sorted by address. + pub fn allocate(mut self) -> Result, AllocateError> { + validate_fixed_requests(&self.fixed)?; + validate_dynamic_requests(&self.mmio32)?; + validate_ram_requests(&self.ram)?; + validate_dynamic_requests(&self.mmio64)?; + + let mut state = AllocationState::new(); + state.place_fixed(&mut self.fixed)?; + state.place_mmio32(&mut self.mmio32)?; + state.place_ram(&mut self.ram)?; + state.place_mmio64(&mut self.mmio64)?; + + state.allocations.sort_by_key(|allocation| allocation.range); + Ok(state.allocations) + } +} - let mut allocations: Vec = vec![MemoryRange::EMPTY; self.requests.len()]; +impl Default for LayoutBuilder<'_> { + fn default() -> Self { + Self::new() + } +} - // Step 1: Collect pinned requests, sort by address, check for - // overlaps with a single adjacent-pair scan. 
- let mut pinned: Vec<(MemoryRange, usize)> = self - .requests - .iter() - .enumerate() - .filter_map(|(i, req)| { - if let Constraint::Pinned(addr) = req.constraint { - Some((MemoryRange::new(addr..addr + req.size), i)) - } else { - None - } - }) - .collect(); +fn validate_size_alignment(tag: &str, size: u64, alignment: u64) -> Result<(), AllocateError> { + if size == 0 || !size.is_multiple_of(PAGE_SIZE) { + return Err(AllocateError::InvalidSize { + tag: tag.to_string(), + size, + }); + } - pinned.sort_by_key(|(range, _)| range.start()); + if alignment < PAGE_SIZE || !alignment.is_power_of_two() { + return Err(AllocateError::InvalidAlignment { + tag: tag.to_string(), + alignment, + }); + } - for [(range_a, idx_a), (range_b, idx_b)] in pinned.array_windows() { - if range_a.overlaps(range_b) { - return Err(AllocateError::PinnedOverlap { - tag_a: self.requests[*idx_a].tag.clone(), - range_a: *range_a, - tag_b: self.requests[*idx_b].tag.clone(), - range_b: *range_b, - }); - } + Ok(()) +} + +fn validate_fixed_requests(requests: &[FixedRequest<'_>]) -> Result<(), AllocateError> { + for request in requests { + validate_size_alignment(&request.tag, request.size, request.alignment)?; + if !request.base.is_multiple_of(PAGE_SIZE) { + return Err(AllocateError::InvalidFixedAddress { + tag: request.tag.clone(), + address: request.base, + }); } - for &(range, idx) in &pinned { - allocations[idx] = range; + let Some(end) = request.base.checked_add(request.size) else { + return Err(AllocateError::FixedRangeOverflow { + tag: request.tag.clone(), + address: request.base, + size: request.size, + }); + }; + + if end > ADDRESS_LIMIT { + return Err(AllocateError::FixedRangeOverflow { + tag: request.tag.clone(), + address: request.base, + size: request.size, + }); } + } - // Compute free space by subtracting all pinned ranges from the - // full address space in one pass. Both inputs are sorted and - // non-overlapping, so subtract_ranges runs in linear time. 
- let pinned_ranges: Vec = pinned.iter().map(|(r, _)| *r).collect(); - let mut free_ranges: Vec = subtract_ranges( - [MemoryRange::new(0..address_limit)], - pinned_ranges.iter().copied(), - ) - .collect(); - - // Step 2: Collect non-Pinned request indices, sort by - // (alignment DESC, size DESC, input_order ASC). - let mut dynamic: Vec = self - .requests - .iter() - .enumerate() - .filter(|(_, req)| !matches!(req.constraint, Constraint::Pinned(_))) - .map(|(i, _)| i) - .collect(); - - dynamic.sort_by(|&a, &b| { - let ra = &self.requests[a]; - let rb = &self.requests[b]; - rb.alignment - .cmp(&ra.alignment) - .then(rb.size.cmp(&ra.size)) - .then(ra.input_order.cmp(&rb.input_order)) - }); + Ok(()) +} - // Step 3: Greedy top-down placement. For each dynamic request, - // reverse-scan the sorted free list for the highest-address fit, - // then update the free list via subtract_ranges. - for &idx in &dynamic { - let req = &self.requests[idx]; - let (region_start, region_end) = match req.constraint { - Constraint::Below4GiB => (0, FOUR_GIB.min(address_limit)), - Constraint::Above4GiB => (FOUR_GIB, address_limit), - Constraint::Pinned(_) => unreachable!(), - }; +fn validate_dynamic_requests(requests: &[DynamicRequest<'_>]) -> Result<(), AllocateError> { + for request in requests { + validate_size_alignment(&request.tag, request.size, request.alignment)?; + } - match find_highest_fit( - &free_ranges, - req.size, - req.alignment, - region_start, - region_end, - ) { - Some(alloc_start) => { - let alloc_range = MemoryRange::new(alloc_start..alloc_start + req.size); - allocations[idx] = alloc_range; - free_ranges = - subtract_ranges(free_ranges.iter().copied(), [alloc_range]).collect(); - } - None => { - let free_in_region: u64 = free_ranges - .iter() - .filter_map(|r| { - let eff_start = r.start().max(region_start); - let eff_end = r.end().min(region_end); - if eff_start < eff_end { - Some(eff_end - eff_start) - } else { - None - } - }) - .sum(); - return 
Err(AllocateError::Exhausted { - tag: req.tag.clone(), - size: req.size, - alignment: req.alignment, - constraint: req.constraint, - free_space: free_in_region, - }); - } - } - } + Ok(()) +} - // Step 4: Write results to targets and build sorted output. - for (target, alloc) in self.targets.iter_mut().zip(allocations.iter()) { - **target = *alloc; - } +fn validate_ram_requests(requests: &[RamRequest<'_>]) -> Result<(), AllocateError> { + for request in requests { + validate_size_alignment(&request.tag, request.size, request.alignment)?; + } - allocations.sort(); - Ok(allocations) + Ok(()) +} + +fn exhausted_error( + request: &impl RequestDetails, + phase: AllocationPhase, + free_ranges: &[MemoryRange], + region_start: u64, + region_end: u64, +) -> AllocateError { + AllocateError::Exhausted { + tag: request.tag().to_string(), + size: request.size(), + alignment: request.alignment(), + phase, + free_space: free_space_in_region(free_ranges, region_start, region_end), } } -/// Finds the highest aligned start address within `[region_start, region_end)` -/// that fits `size` bytes within one of the free ranges. -/// -/// The free list must be sorted by address. Iterates in reverse to find -/// the highest-address match first. +fn free_space_in_region(free_ranges: &[MemoryRange], region_start: u64, region_end: u64) -> u64 { + free_ranges + .iter() + .map(|range| { + let effective_start = range.start().max(region_start); + let effective_end = range.end().min(region_end); + effective_end.saturating_sub(effective_start) + }) + .sum() +} + fn find_highest_fit( free_ranges: &[MemoryRange], size: u64, @@ -390,19 +618,46 @@ fn find_highest_fit( region_end: u64, ) -> Option { for range in free_ranges.iter().rev() { - // Clip the free range to the constrained region. 
- let eff_start = range.start().max(region_start); - let eff_end = range.end().min(region_end); + let effective_start = range.start().max(region_start); + let effective_end = range.end().min(region_end); - if eff_start >= eff_end || eff_end - eff_start < size { + if effective_start >= effective_end || effective_end - effective_start < size { continue; } - // Find the highest aligned start where [start, start + size) fits. - let latest_start = eff_end - size; - let aligned_start = latest_start & !(alignment - 1); + let latest_start = effective_end - size; + let aligned_start = align_down(latest_start, alignment); + if aligned_start >= effective_start { + return Some(aligned_start); + } + } + + None +} + +fn find_lowest_fit( + free_ranges: &[MemoryRange], + size: u64, + alignment: u64, + region_start: u64, + region_end: u64, +) -> Option { + for range in free_ranges { + let effective_start = range.start().max(region_start); + let effective_end = range.end().min(region_end); - if aligned_start >= eff_start { + if effective_start >= effective_end { + continue; + } + + let Some(aligned_start) = align_up(effective_start, alignment) else { + continue; + }; + let Some(end) = aligned_start.checked_add(size) else { + continue; + }; + + if end <= effective_end { return Some(aligned_start); } } @@ -410,6 +665,56 @@ fn find_highest_fit( None } +fn find_lowest_splittable_fit( + free_ranges: &[MemoryRange], + size: u64, + alignment: u64, + region_start: u64, + region_end: u64, +) -> Option> { + let mut remaining = size; + let mut ranges = Vec::new(); + + for range in free_ranges { + let effective_start = range.start().max(region_start); + let effective_end = range.end().min(region_end); + + if effective_start >= effective_end { + continue; + } + + let Some(aligned_start) = align_up(effective_start, alignment) else { + continue; + }; + if aligned_start >= effective_end { + continue; + } + + let available = effective_end - aligned_start; + let allocation_size = 
available.min(remaining); + ranges.push(MemoryRange::new( + aligned_start..aligned_start + allocation_size, + )); + remaining -= allocation_size; + + if remaining == 0 { + return Some(ranges); + } + } + + None +} + +fn align_down(value: u64, alignment: u64) -> u64 { + value & !(alignment - 1) +} + +fn align_up(value: u64, alignment: u64) -> Option { + value + .checked_add(alignment - 1) + .map(|value| align_down(value, alignment)) +} + #[cfg(test)] mod tests { use super::*; @@ -420,569 +725,376 @@ mod tests { #[test] fn empty_input() { - let builder = LayoutBuilder::new(48); - let sorted = builder.allocate().unwrap(); + let sorted = LayoutBuilder::new().allocate().unwrap(); assert!(sorted.is_empty()); } #[test] - fn single_pinned() { + fn fixed_request_fills_target() { let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); + let mut builder = LayoutBuilder::new(); builder.request( - "test", + "fixed", &mut target, 4 * MIB, PAGE_SIZE, - Constraint::Pinned(0xFC00_0000), + Placement::Fixed(0xFC00_0000), ); - let sorted = builder.allocate().unwrap(); - assert_eq!(target, MemoryRange::new(0xFC00_0000..0xFC00_0000 + 4 * MIB)); - assert_eq!(sorted.len(), 1); - } - #[test] - fn multiple_pinned_non_overlapping() { - let mut t1 = MemoryRange::EMPTY; - let mut t2 = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("a", &mut t1, 4 * KIB, PAGE_SIZE, Constraint::Pinned(0x1000)); - builder.request("b", &mut t2, 4 * KIB, PAGE_SIZE, Constraint::Pinned(0x2000)); let sorted = builder.allocate().unwrap(); - assert_eq!(t1, MemoryRange::new(0x1000..0x2000)); - assert_eq!(t2, MemoryRange::new(0x2000..0x3000)); - assert_eq!(sorted[0], MemoryRange::new(0x1000..0x2000)); - assert_eq!(sorted[1], MemoryRange::new(0x2000..0x3000)); - } - #[test] - fn pinned_overlap_rejected() { - let mut t1 = MemoryRange::EMPTY; - let mut t2 = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("a", &mut t1, 8 * KIB, 
PAGE_SIZE, Constraint::Pinned(0x1000)); - builder.request("b", &mut t2, 4 * KIB, PAGE_SIZE, Constraint::Pinned(0x2000)); - let err = builder.allocate().unwrap_err(); - assert!(matches!(err, AllocateError::PinnedOverlap { .. })); + assert_eq!(target, MemoryRange::new(0xFC00_0000..0xFC40_0000)); + assert_eq!(sorted[0].range, target); + assert_eq!(sorted[0].kind, PlacedRangeKind::Fixed); } #[test] - fn pinned_out_of_bounds() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(32); + fn fixed_overlap_rejected() { + let mut first = MemoryRange::EMPTY; + let mut second = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); builder.request( - "oob", - &mut target, + "first", + &mut first, 8 * KIB, PAGE_SIZE, - Constraint::Pinned(0xFFFF_F000), + Placement::Fixed(0x1000), ); - let err = builder.allocate().unwrap_err(); - assert!(matches!(err, AllocateError::PinnedOutOfBounds { .. })); + builder.request( + "second", + &mut second, + 4 * KIB, + PAGE_SIZE, + Placement::Fixed(0x2000), + ); + + let error = builder.allocate().unwrap_err(); + + assert!(matches!(error, AllocateError::FixedOverlap { .. })); } #[test] - fn pinned_at_edge_of_address_space() { + fn invalid_request_rejected() { let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(32); + let mut builder = LayoutBuilder::new(); + builder.request("zero", &mut target, 0, PAGE_SIZE, Placement::Mmio32); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidSize { .. } + )); + + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request("alignment", &mut target, PAGE_SIZE, KIB, Placement::Mmio32); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidAlignment { .. 
} + )); + + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); builder.request( - "edge", + "fixed", &mut target, - 4 * KIB, PAGE_SIZE, - Constraint::Pinned(0xFFFF_F000), + PAGE_SIZE, + Placement::Fixed(0x1234), ); - builder.allocate().unwrap(); - assert_eq!(target, MemoryRange::new(0xFFFF_F000..0x1_0000_0000)); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidFixedAddress { .. } + )); } #[test] - fn pinned_at_address_zero() { + fn fixed_range_overflow_rejected() { let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); + let mut builder = LayoutBuilder::new(); builder.request( - "zero", + "overflow", &mut target, - 4 * KIB, + 2 * PAGE_SIZE, PAGE_SIZE, - Constraint::Pinned(0), + Placement::Fixed(ADDRESS_LIMIT - PAGE_SIZE), ); - builder.allocate().unwrap(); - assert_eq!(target, MemoryRange::new(0..PAGE_SIZE)); - } - #[test] - fn single_below_4gib() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("test", &mut target, MIB, MIB, Constraint::Below4GiB); - builder.allocate().unwrap(); - assert!(target.end() <= FOUR_GIB); - assert_eq!(target.len(), MIB); - assert_eq!(target.start() % MIB, 0); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::FixedRangeOverflow { .. 
} + )); } #[test] - fn single_above_4gib() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("test", &mut target, GIB, MIB, Constraint::Above4GiB); - builder.allocate().unwrap(); - assert!(target.start() >= FOUR_GIB); - assert_eq!(target.len(), GIB); - assert_eq!(target.start() % MIB, 0); - } + fn mmio32_uses_top_down_placement_below_4_gib() { + let mut reserved = MemoryRange::EMPTY; + let mut first = MemoryRange::EMPTY; + let mut second = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request( + "reserved", + &mut reserved, + 32 * MIB, + PAGE_SIZE, + Placement::Fixed(0xFE00_0000), + ); + builder.request("first", &mut first, MIB, MIB, Placement::Mmio32); + builder.request("second", &mut second, MIB, MIB, Placement::Mmio32); - #[test] - fn below_4gib_top_down_placement() { - let mut t1 = MemoryRange::EMPTY; - let mut t2 = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("a", &mut t1, MIB, MIB, Constraint::Below4GiB); - builder.request("b", &mut t2, MIB, MIB, Constraint::Below4GiB); builder.allocate().unwrap(); - // Same alignment and size → input order tiebreaker. t1 (order 0) - // is placed first (highest address), t2 (order 1) gets the next - // highest. 
- assert!(t1.start() > t2.start()); - assert!(!t1.overlaps(&t2)); + + assert_eq!(first, MemoryRange::new(0xFDF0_0000..0xFE00_0000)); + assert_eq!(second, MemoryRange::new(0xFDE0_0000..0xFDF0_0000)); } #[test] - fn alignment_driven_ordering() { + fn mmio32_orders_by_alignment_then_size_then_request_order() { let mut small = MemoryRange::EMPTY; - let mut big = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("small", &mut small, MIB, MIB, Constraint::Below4GiB); - builder.request("big", &mut big, MIB, 256 * MIB, Constraint::Below4GiB); + let mut aligned = MemoryRange::EMPTY; + let mut large = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request("small", &mut small, MIB, MIB, Placement::Mmio32); + builder.request("aligned", &mut aligned, MIB, 256 * MIB, Placement::Mmio32); + builder.request("large", &mut large, 4 * MIB, MIB, Placement::Mmio32); + builder.allocate().unwrap(); - assert_eq!(big.start() % (256 * MIB), 0); - assert_eq!(small.start() % MIB, 0); - assert!(!big.overlaps(&small)); - assert!(big.end() <= FOUR_GIB); - assert!(small.end() <= FOUR_GIB); + + assert_eq!(aligned.start() % (256 * MIB), 0); + assert_eq!(large.len(), 4 * MIB); + assert_eq!(small.len(), MIB); + assert!(!aligned.overlaps(&large)); + assert!(!aligned.overlaps(&small)); + assert!(!large.overlaps(&small)); } #[test] - fn size_driven_ordering_same_alignment() { - let mut small = MemoryRange::EMPTY; - let mut big = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("small", &mut small, MIB, MIB, Constraint::Below4GiB); - builder.request("big", &mut big, 4 * MIB, MIB, Constraint::Below4GiB); - builder.allocate().unwrap(); - assert!(big.start() > small.start()); + fn ram_starts_at_zero() { + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(ram, [MemoryRange::new(0..2 * GIB)]); 
+ assert_eq!(sorted[0].kind, PlacedRangeKind::Ram); + assert_eq!(sorted[0].range, ram[0]); } #[test] - fn pinned_plus_dynamic() { - let mut reserved = MemoryRange::EMPTY; - let mut dynamic = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(32); - builder.request( - "reserved", - &mut reserved, - 32 * MIB, - PAGE_SIZE, - Constraint::Pinned(0xFE00_0000), - ); - builder.request("dynamic", &mut dynamic, MIB, MIB, Constraint::Below4GiB); + fn ram_splits_around_fixed_ranges_and_mmio32() { + let mut fixed = MemoryRange::EMPTY; + let mut mmio32 = MemoryRange::EMPTY; + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.request("fixed", &mut fixed, MIB, PAGE_SIZE, Placement::Fixed(GIB)); + builder.request("mmio32", &mut mmio32, 2 * GIB, MIB, Placement::Mmio32); + builder.ram("ram", &mut ram, 3 * GIB, PAGE_SIZE); + builder.allocate().unwrap(); - assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); - assert!(!dynamic.overlaps(&reserved)); - assert!(dynamic.end() <= FOUR_GIB); - } - #[test] - fn exhaustion_below_4gib() { - let mut t1 = MemoryRange::EMPTY; - let mut t2 = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(32); - builder.request("pin", &mut t1, GIB, PAGE_SIZE, Constraint::Pinned(0)); - builder.request( - "too_big", - &mut t2, - 4 * GIB, - PAGE_SIZE, - Constraint::Below4GiB, + assert_eq!( + ram, + [ + MemoryRange::new(0..GIB), + MemoryRange::new(GIB + MIB..2 * GIB), + MemoryRange::new(FOUR_GIB..FOUR_GIB + GIB + MIB), + ] ); - let err = builder.allocate().unwrap_err(); - assert!(matches!(err, AllocateError::Exhausted { .. })); } #[test] - fn exhaustion_above_4gib_narrow_width() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(32); - builder.request( - "above", - &mut target, - PAGE_SIZE, - PAGE_SIZE, - Constraint::Above4GiB, - ); - let err = builder.allocate().unwrap_err(); - assert!(matches!(err, AllocateError::Exhausted { .. 
})); + fn mmio64_uses_bottom_up_placement_from_end_of_ram() { + let mut ram = Vec::new(); + let mut first = MemoryRange::EMPTY; + let mut second = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.request("first", &mut first, MIB, MIB, Placement::Mmio64); + builder.request("second", &mut second, MIB, MIB, Placement::Mmio64); + + builder.allocate().unwrap(); + + assert_eq!(first, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert_eq!(second, MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB)); } #[test] - fn exhaustion_alignment_fragmentation() { - let mut t1 = MemoryRange::EMPTY; - let mut t2 = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(36); + fn mmio64_skips_fixed_ranges_above_ram() { + let mut ram = Vec::new(); + let mut fixed = MemoryRange::EMPTY; + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); builder.request( - "pin", - &mut t1, - 0xF800_0000, + "fixed", + &mut fixed, + MIB, PAGE_SIZE, - Constraint::Pinned(0), - ); - builder.request( - "misaligned", - &mut t2, - 128 * MIB, - 256 * MIB, - Constraint::Below4GiB, + Placement::Fixed(2 * GIB), ); - let err = builder.allocate().unwrap_err(); - assert!(matches!(err, AllocateError::Exhausted { .. 
})); - } + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); - #[test] - fn below_4gib_above_4gib_filtering() { - let mut below = MemoryRange::EMPTY; - let mut above = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("below", &mut below, MIB, MIB, Constraint::Below4GiB); - builder.request("above", &mut above, MIB, MIB, Constraint::Above4GiB); builder.allocate().unwrap(); - assert!(below.end() <= FOUR_GIB); - assert!(above.start() >= FOUR_GIB); + + assert_eq!(mmio64, MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB)); } #[test] - fn sorted_ranges_order() { - let mut t_above = MemoryRange::EMPTY; - let mut t_pinned = MemoryRange::EMPTY; - let mut t_below = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("above", &mut t_above, GIB, MIB, Constraint::Above4GiB); + fn fixed_hypertransport_hole_is_regular_fixed_placement() { + let mut ram = Vec::new(); + let mut hypertransport = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); builder.request( - "pinned", - &mut t_pinned, - PAGE_SIZE, + "amd_hypertransport_hole", + &mut hypertransport, + GIB, PAGE_SIZE, - Constraint::Pinned(0x1000), + Placement::Fixed(0xFD_0000_0000), ); - builder.request("below", &mut t_below, MIB, MIB, Constraint::Below4GiB); - let sorted = builder.allocate().unwrap(); - assert_eq!(sorted.len(), 3); - // Pinned at 0x1000 should be first. 
- assert_eq!(sorted[0], t_pinned); - for [a, b] in sorted.array_windows() { - assert!(a.start() < b.start()); - } - } - #[test] - fn determinism() { - let mut prev_sorted: Option> = None; - for _ in 0..10 { - let mut a = MemoryRange::EMPTY; - let mut b = MemoryRange::EMPTY; - let mut c = MemoryRange::EMPTY; - let mut d = MemoryRange::EMPTY; - let mut e = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("a", &mut a, 4 * MIB, MIB, Constraint::Below4GiB); - builder.request("b", &mut b, GIB, 256 * MIB, Constraint::Above4GiB); - builder.request( - "c", - &mut c, - 32 * MIB, - PAGE_SIZE, - Constraint::Pinned(0xFE00_0000), - ); - builder.request("d", &mut d, 128 * MIB, MIB, Constraint::Below4GiB); - builder.request("e", &mut e, PAGE_SIZE, PAGE_SIZE, Constraint::Below4GiB); - let sorted = builder.allocate().unwrap(); - if let Some(prev) = &prev_sorted { - assert_eq!(prev, &sorted); - } - prev_sorted = Some(sorted); - } - } - - #[test] - fn invalid_address_width() { - let builder = LayoutBuilder::new(0); - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::InvalidAddressWidth(0) - )); - let builder = LayoutBuilder::new(64); - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::InvalidAddressWidth(64) - )); - } - - #[test] - fn invalid_size() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("bad", &mut target, 0, PAGE_SIZE, Constraint::Below4GiB); - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::InvalidSize { .. } - )); + let sorted = builder.allocate().unwrap(); - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("bad", &mut target, 100, PAGE_SIZE, Constraint::Below4GiB); - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::InvalidSize { .. 
} - )); + assert_eq!( + hypertransport, + MemoryRange::new(0xFD_0000_0000..0xFD_4000_0000) + ); + assert_eq!(sorted.last().unwrap().range, hypertransport); } #[test] - fn invalid_alignment() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("bad", &mut target, PAGE_SIZE, 1024, Constraint::Below4GiB); - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::InvalidAlignment { .. } - )); - - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); + fn exhaustion_reports_phase() { + let mut mmio32 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); builder.request( - "bad", - &mut target, + "too_big", + &mut mmio32, + 4 * GIB + PAGE_SIZE, PAGE_SIZE, - 6 * KIB, - Constraint::Below4GiB, + Placement::Mmio32, ); assert!(matches!( builder.allocate().unwrap_err(), - AllocateError::InvalidAlignment { .. } + AllocateError::Exhausted { + phase: AllocationPhase::Mmio32, + .. + } )); - } - #[test] - fn invalid_pinned_address() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); + let mut ram = Vec::new(); + let mut fixed = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); builder.request( - "bad", - &mut target, - PAGE_SIZE, + "fixed", + &mut fixed, + ADDRESS_LIMIT, PAGE_SIZE, - Constraint::Pinned(0x1234), + Placement::Fixed(0), ); + builder.ram("ram", &mut ram, PAGE_SIZE, PAGE_SIZE); assert!(matches!( builder.allocate().unwrap_err(), - AllocateError::InvalidPinnedAddress { .. } + AllocateError::Exhausted { + phase: AllocationPhase::Ram, + .. 
+ } )); - } - #[test] - fn realistic_x86_layout() { - let mut reserved = MemoryRange::EMPTY; - let mut vmbus_low = MemoryRange::EMPTY; - let mut vmbus_high = MemoryRange::EMPTY; - let mut pcie_ecam = MemoryRange::EMPTY; - let mut pcie_low = MemoryRange::EMPTY; - let mut pcie_high = MemoryRange::EMPTY; - let mut virtio = [MemoryRange::EMPTY; 4]; - - let mut builder = LayoutBuilder::new(48); + let mut ram = Vec::new(); + let mut fixed = MemoryRange::EMPTY; + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, PAGE_SIZE, PAGE_SIZE); builder.request( - "reserved", - &mut reserved, - 32 * MIB, + "fixed", + &mut fixed, + ADDRESS_LIMIT - PAGE_SIZE, PAGE_SIZE, - Constraint::Pinned(0xFE00_0000), - ); - builder.request( - "vmbus_low", - &mut vmbus_low, - 128 * MIB, - MIB, - Constraint::Below4GiB, - ); - builder.request( - "vmbus_high", - &mut vmbus_high, - GIB, - MIB, - Constraint::Above4GiB, - ); - builder.request( - "pcie_ecam", - &mut pcie_ecam, - 256 * MIB, - 256 * MIB, - Constraint::Below4GiB, + Placement::Fixed(PAGE_SIZE), ); builder.request( - "pcie_low", - &mut pcie_low, - 64 * MIB, - MIB, - Constraint::Below4GiB, + "mmio64", + &mut mmio64, + PAGE_SIZE, + PAGE_SIZE, + Placement::Mmio64, ); - builder.request("pcie_high", &mut pcie_high, GIB, MIB, Constraint::Above4GiB); - for (i, v) in virtio.iter_mut().enumerate() { - builder.request( - format!("virtio_{i}"), - v, - PAGE_SIZE, - PAGE_SIZE, - Constraint::Below4GiB, - ); - } - - let sorted = builder.allocate().unwrap(); - - assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); - assert!(vmbus_low.end() <= FOUR_GIB); - assert!(pcie_ecam.end() <= FOUR_GIB); - assert!(pcie_low.end() <= FOUR_GIB); - for v in &virtio { - assert!(v.end() <= FOUR_GIB); - } - assert!(vmbus_high.start() >= FOUR_GIB); - assert!(pcie_high.start() >= FOUR_GIB); - - for [a, b] in sorted.array_windows() { - assert!(a.end() <= b.start(), "overlap: {} and {}", a, b); - } - - 
assert_eq!(pcie_ecam.start() % (256 * MIB), 0); - - for r in &sorted { - assert!(r.end() <= 1u64 << 48); - } + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::Exhausted { + phase: AllocationPhase::Mmio64, + .. + } + )); } #[test] - fn realistic_aarch64_layout() { - let mut reserved = MemoryRange::EMPTY; - let mut vmbus_low = MemoryRange::EMPTY; - let mut vmbus_high = MemoryRange::EMPTY; - - let mut builder = LayoutBuilder::new(48); - builder.request( - "reserved", - &mut reserved, - 272 * MIB, - PAGE_SIZE, - Constraint::Pinned(0xEF00_0000), - ); - builder.request( - "vmbus_low", - &mut vmbus_low, - 128 * MIB, - MIB, - Constraint::Below4GiB, - ); - builder.request( - "vmbus_high", - &mut vmbus_high, - GIB, - MIB, - Constraint::Above4GiB, - ); + fn sorted_result_preserves_tags_and_kinds() { + let mut ram = Vec::new(); + let mut mmio32 = MemoryRange::EMPTY; + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, GIB, PAGE_SIZE); + builder.request("mmio32", &mut mmio32, MIB, MIB, Placement::Mmio32); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); let sorted = builder.allocate().unwrap(); - assert_eq!(reserved, MemoryRange::new(0xEF00_0000..0x1_0000_0000)); - assert!(vmbus_low.end() <= FOUR_GIB); - assert!(vmbus_high.start() >= FOUR_GIB); - - for [a, b] in sorted.array_windows() { - assert!(a.end() <= b.start()); - } + assert_eq!(sorted[0].tag, "ram"); + assert_eq!(sorted[0].kind, PlacedRangeKind::Ram); + assert_eq!(sorted[1].tag, "mmio64"); + assert_eq!(sorted[1].kind, PlacedRangeKind::Mmio64); + assert_eq!(sorted[2].tag, "mmio32"); + assert_eq!(sorted[2].kind, PlacedRangeKind::Mmio32); } #[test] - fn pinned_at_top_of_space() { - let mut target = MemoryRange::EMPTY; - let width: u8 = 36; - let limit = 1u64 << width; - let mut builder = LayoutBuilder::new(width); - builder.request( - "top", - &mut target, - PAGE_SIZE, - PAGE_SIZE, - Constraint::Pinned(limit - PAGE_SIZE), 
- ); - builder.allocate().unwrap(); - assert_eq!(target, MemoryRange::new((limit - PAGE_SIZE)..limit)); - } + fn deterministic() { + let mut previous = None; - #[test] - fn many_small_allocations() { - let mut targets = [MemoryRange::EMPTY; 100]; - let mut builder = LayoutBuilder::new(48); - for (i, t) in targets.iter_mut().enumerate() { + for _ in 0..10 { + let mut ram = Vec::new(); + let mut reserved = MemoryRange::EMPTY; + let mut vmbus_low = MemoryRange::EMPTY; + let mut pcie_ecam = MemoryRange::EMPTY; + let mut pcie_high = MemoryRange::EMPTY; + let mut virtio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); builder.request( - format!("s{i}"), - t, + "reserved", + &mut reserved, + 32 * MIB, + PAGE_SIZE, + Placement::Fixed(0xFE00_0000), + ); + builder.request( + "vmbus_low", + &mut vmbus_low, + 128 * MIB, + MIB, + Placement::Mmio32, + ); + builder.request( + "pcie_ecam", + &mut pcie_ecam, + 256 * MIB, + 256 * MIB, + Placement::Mmio32, + ); + builder.request("pcie_high", &mut pcie_high, GIB, MIB, Placement::Mmio64); + builder.request( + "virtio", + &mut virtio, PAGE_SIZE, PAGE_SIZE, - Constraint::Below4GiB, + Placement::Mmio32, ); - } - let sorted = builder.allocate().unwrap(); - assert_eq!(sorted.len(), 100); - for [a, b] in sorted.array_windows() { - assert!(a.end() <= b.start()); - } - for r in &sorted { - assert!(r.end() <= FOUR_GIB); - } - } - - #[test] - fn mixed_constraints_with_pinned() { - let mut p1 = MemoryRange::EMPTY; - let mut p2 = MemoryRange::EMPTY; - let mut d1 = MemoryRange::EMPTY; - let mut d2 = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(48); - builder.request("p1", &mut p1, GIB, PAGE_SIZE, Constraint::Pinned(GIB)); - builder.request("p2", &mut p2, GIB, PAGE_SIZE, Constraint::Pinned(3 * GIB)); - builder.request("d1", &mut d1, 512 * MIB, MIB, Constraint::Below4GiB); - builder.request("d2", &mut d2, 512 * MIB, MIB, Constraint::Below4GiB); - 
builder.allocate().unwrap(); - - assert_eq!(p1, MemoryRange::new(GIB..2 * GIB)); - assert_eq!(p2, MemoryRange::new(3 * GIB..4 * GIB)); - assert!(!d1.overlaps(&d2)); - assert!(!d1.overlaps(&p1)); - assert!(!d1.overlaps(&p2)); - assert!(!d2.overlaps(&p1)); - assert!(!d2.overlaps(&p2)); - } - - #[test] - fn narrow_address_space() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(20); - builder.request( - "test", - &mut target, - PAGE_SIZE, - PAGE_SIZE, - Constraint::Below4GiB, - ); - builder.allocate().unwrap(); - assert!(target.end() <= 1u64 << 20); + let sorted = builder.allocate().unwrap(); + if let Some(previous) = &previous { + assert_eq!(previous, &sorted); + } + previous = Some(sorted); + } } } From 35dcf3910c1c888d7751ee55887a2c01f86a3563 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 03:00:24 +0000 Subject: [PATCH 03/36] Route worker RAM layout through allocator --- openvmm/openvmm_core/src/worker/dispatch.rs | 60 ++-- .../openvmm_core/src/worker/memory_layout.rs | 298 ++++++++++++++++++ openvmm/openvmm_core/src/worker/mod.rs | 1 + vm/vmcore/vm_topology/src/memory.rs | 85 ++++- 4 files changed, 401 insertions(+), 43 deletions(-) create mode 100644 openvmm/openvmm_core/src/worker/memory_layout.rs diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 7ebf795bea..200762cf31 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -5,6 +5,8 @@ use crate::emuplat; use crate::partition::BindHvliteVp; use crate::partition::HvlitePartition; use crate::vmgs_non_volatile_store::HvLiteVmgsNonVolatileStore; +use crate::worker::memory_layout::MemoryLayoutInput; +use crate::worker::memory_layout::resolve_memory_layout; use crate::worker::rom::RomBuilder; use acpi::dsdt; use anyhow::Context; @@ -895,50 +897,24 @@ impl InitializedVm { }; // Choose the memory layout of the VM. 
- let mem_layout = if let Some(ref sizes) = cfg.memory.numa_mem_sizes { - // When numa_mem_sizes is set, distribute guest RAM across vNUMA nodes - // for ACPI SRAT / FDT reporting. - // - // TODO: The vNUMA nodes reported are meant for test usage only, as they - // are not aligned to any physical NUMA node. There is more work to do - // to support useful vNUMA reporting. - let total: u64 = sizes - .iter() - .copied() - .try_fold(0u64, |acc, s| acc.checked_add(s)) - .context("numa memory sizes overflow")?; - anyhow::ensure!( - total == cfg.memory.mem_size, - "numa_mem_sizes total ({total:#x}) does not match mem_size ({:#x})", - cfg.memory.mem_size - ); - - MemoryLayout::new_with_numa( - sizes, - &cfg.memory.mmio_gaps, - &cfg.memory.pci_ecam_gaps, - &cfg.memory.pci_mmio_gaps, - vtl2_range, - ) - } else { - MemoryLayout::new( - cfg.memory.mem_size, - &cfg.memory.mmio_gaps, - &cfg.memory.pci_ecam_gaps, - &cfg.memory.pci_mmio_gaps, - vtl2_range, - ) - } + // + // When numa_mem_sizes is set, distribute guest RAM across vNUMA nodes + // for ACPI SRAT / FDT reporting. + // + // TODO: The vNUMA nodes reported are meant for test usage only, as they + // are not aligned to any physical NUMA node. There is more work to do + // to support useful vNUMA reporting. + let mem_layout = resolve_memory_layout(MemoryLayoutInput { + mem_size: cfg.memory.mem_size, + numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), + mmio_gaps: &cfg.memory.mmio_gaps, + pci_ecam_gaps: &cfg.memory.pci_ecam_gaps, + pci_mmio_gaps: &cfg.memory.pci_mmio_gaps, + vtl2_range, + physical_address_size, + }) .context("invalid memory configuration")?; - if mem_layout.end_of_layout() > 1 << physical_address_size { - anyhow::bail!( - "memory layout ends at {:#x}, which exceeds the address with of {} bits", - mem_layout.end_of_layout(), - physical_address_size - ); - } - // Place the alias map at the end of the address space. 
Newer versions // of OpenHCL support receiving this offset via devicetree (especially // important on ARM64 where the physical address width used here is not diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs new file mode 100644 index 0000000000..d2ae7df689 --- /dev/null +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -0,0 +1,298 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +use anyhow::Context; +use anyhow::bail; +use memory_range::MemoryRange; +use vm_topology::layout::LayoutBuilder; +use vm_topology::layout::Placement; +use vm_topology::memory::MemoryLayout; +use vm_topology::memory::MemoryRangeWithNode; + +const PAGE_SIZE: u64 = 4096; + +pub(super) struct MemoryLayoutInput<'a> { + pub mem_size: u64, + pub numa_mem_sizes: Option<&'a [u64]>, + pub mmio_gaps: &'a [MemoryRange], + pub pci_ecam_gaps: &'a [MemoryRange], + pub pci_mmio_gaps: &'a [MemoryRange], + pub vtl2_range: Option, + pub physical_address_size: u8, +} + +pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Result { + let ram_sizes = validate_ram_sizes(input.mem_size, input.numa_mem_sizes)?; + + let mut resolved_mmio_gaps = vec![MemoryRange::EMPTY; input.mmio_gaps.len()]; + let mut resolved_pci_ecam_gaps = vec![MemoryRange::EMPTY; input.pci_ecam_gaps.len()]; + let mut resolved_pci_mmio_gaps = vec![MemoryRange::EMPTY; input.pci_mmio_gaps.len()]; + let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; + + let mut builder = LayoutBuilder::new(); + add_fixed_ranges( + &mut builder, + "mmio", + input.mmio_gaps, + &mut resolved_mmio_gaps, + ); + add_fixed_ranges( + &mut builder, + "pci_ecam", + input.pci_ecam_gaps, + &mut resolved_pci_ecam_gaps, + ); + add_fixed_ranges( + &mut builder, + "pci_mmio", + input.pci_mmio_gaps, + &mut resolved_pci_mmio_gaps, + ); + + for (vnode, (ram_size, ram_ranges)) in ram_sizes + .iter() + .copied() + .zip(&mut ram_ranges_by_node) + 
.enumerate() + { + builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, PAGE_SIZE); + } + + builder + .allocate() + .context("allocating memory layout ranges")?; + + let ram = ram_ranges_by_node + .into_iter() + .enumerate() + .flat_map(|(vnode, ranges)| { + ranges.into_iter().map(move |range| MemoryRangeWithNode { + range, + vnode: vnode as u32, + }) + }) + .collect::>(); + + let memory_layout = MemoryLayout::new_from_resolved_ranges( + ram, + input.mmio_gaps.to_vec(), + input.pci_ecam_gaps.to_vec(), + input.pci_mmio_gaps.to_vec(), + input.vtl2_range, + ) + .context("validating resolved memory layout")?; + + let address_space_limit = physical_address_limit(input.physical_address_size); + if memory_layout.end_of_layout() > address_space_limit { + bail!( + "memory layout ends at {:#x}, which exceeds the address width of {} bits", + memory_layout.end_of_layout(), + input.physical_address_size + ); + } + + Ok(memory_layout) +} + +fn add_fixed_ranges<'a>( + builder: &mut LayoutBuilder<'a>, + tag_prefix: &str, + ranges: &[MemoryRange], + targets: &'a mut [MemoryRange], +) { + for (index, (range, target)) in ranges.iter().zip(targets).enumerate() { + builder.request( + format!("{tag_prefix}[{index}]"), + target, + range.len(), + PAGE_SIZE, + Placement::Fixed(range.start()), + ); + } +} + +fn validate_ram_sizes(mem_size: u64, numa_mem_sizes: Option<&[u64]>) -> anyhow::Result> { + if mem_size == 0 || !mem_size.is_multiple_of(PAGE_SIZE) { + bail!("invalid memory size {mem_size:#x}"); + } + + let Some(numa_mem_sizes) = numa_mem_sizes else { + return Ok(vec![mem_size]); + }; + + if numa_mem_sizes.is_empty() { + bail!("empty NUMA memory sizes"); + } + + for &size in numa_mem_sizes { + if size == 0 || !size.is_multiple_of(PAGE_SIZE) { + bail!("invalid NUMA node memory size {size:#x}"); + } + } + + let total = numa_mem_sizes + .iter() + .copied() + .try_fold(0u64, |acc, size| acc.checked_add(size)) + .context("numa memory sizes overflow")?; + if total != mem_size { + 
bail!("numa_mem_sizes total ({total:#x}) does not match mem_size ({mem_size:#x})"); + } + + Ok(numa_mem_sizes.to_vec()) +} + +fn physical_address_limit(physical_address_size: u8) -> u64 { + if physical_address_size >= u64::BITS as u8 { + u64::MAX + } else { + 1u64 << physical_address_size + } +} + +#[cfg(test)] +mod tests { + use super::*; + use vm_topology::memory::AddressType; + + const GB: u64 = 1024 * 1024 * 1024; + const MB: u64 = 1024 * 1024; + + fn input<'a>( + mem_size: u64, + numa_mem_sizes: Option<&'a [u64]>, + mmio_gaps: &'a [MemoryRange], + pci_ecam_gaps: &'a [MemoryRange], + pci_mmio_gaps: &'a [MemoryRange], + vtl2_range: Option, + ) -> MemoryLayoutInput<'a> { + MemoryLayoutInput { + mem_size, + numa_mem_sizes, + mmio_gaps, + pci_ecam_gaps, + pci_mmio_gaps, + vtl2_range, + physical_address_size: 46, + } + } + + #[test] + fn non_numa_matches_memory_layout_new() { + let mmio = [ + MemoryRange::new(2 * GB..3 * GB), + MemoryRange::new(4 * GB..5 * GB), + ]; + let pci_ecam = [MemoryRange::new(8 * GB..8 * GB + MB)]; + let pci_mmio = [MemoryRange::new(6 * GB..7 * GB)]; + + let actual = + resolve_memory_layout(input(6 * GB, None, &mmio, &pci_ecam, &pci_mmio, None)).unwrap(); + let expected = MemoryLayout::new(6 * GB, &mmio, &pci_ecam, &pci_mmio, None).unwrap(); + + assert_eq!(actual.ram(), expected.ram()); + assert_eq!(actual.mmio(), expected.mmio()); + assert_eq!(actual.ram_size(), expected.ram_size()); + assert_eq!(actual.end_of_ram(), expected.end_of_ram()); + assert_eq!(actual.end_of_layout(), expected.end_of_layout()); + } + + #[test] + fn numa_preserves_node_ordering_and_splitting() { + let mmio = [MemoryRange::new(3 * GB..4 * GB)]; + let sizes = [2 * GB, 2 * GB]; + + let actual = + resolve_memory_layout(input(4 * GB, Some(&sizes), &mmio, &[], &[], None)).unwrap(); + let expected = MemoryLayout::new_with_numa(&sizes, &mmio, &[], &[], None).unwrap(); + + assert_eq!(actual.ram(), expected.ram()); + } + + #[test] + fn fixed_ranges_are_occupied_for_ram() { + 
let mmio = [MemoryRange::new(GB..2 * GB)]; + let pci_ecam = [MemoryRange::new(3 * GB..3 * GB + MB)]; + let pci_mmio = [MemoryRange::new(4 * GB..5 * GB)]; + + let actual = + resolve_memory_layout(input(4 * GB, None, &mmio, &pci_ecam, &pci_mmio, None)).unwrap(); + + assert_eq!(actual.probe_address(GB), Some(AddressType::Mmio)); + assert_eq!(actual.probe_address(3 * GB), Some(AddressType::PciEcam)); + assert_eq!(actual.probe_address(4 * GB), Some(AddressType::PciMmio)); + assert_eq!(actual.ram_size(), 4 * GB); + assert!(actual.ram().iter().all(|ram| { + !ram.range.overlaps(&mmio[0]) + && !ram.range.overlaps(&pci_ecam[0]) + && !ram.range.overlaps(&pci_mmio[0]) + })); + } + + #[test] + fn vtl2_is_validated_after_ram_placement() { + let mmio = [MemoryRange::new(GB..2 * GB)]; + let vtl2_range = MemoryRange::new(GB..GB + 2 * MB); + + let err = resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], Some(vtl2_range))) + .unwrap_err(); + + assert!( + err.to_string() + .contains("validating resolved memory layout") + ); + } + + #[test] + fn vtl2_does_not_split_ram() { + let vtl2_range = MemoryRange::new(GB..2 * GB); + + assert!( + resolve_memory_layout(input(2 * GB, None, &[], &[], &[], Some(vtl2_range))).is_err() + ); + } + + #[test] + fn deterministic_for_same_inputs() { + let mmio = [ + MemoryRange::new(GB..2 * GB), + MemoryRange::new(5 * GB..6 * GB), + ]; + let pci_ecam = [MemoryRange::new(3 * GB..3 * GB + MB)]; + let pci_mmio = [MemoryRange::new(7 * GB..8 * GB)]; + let sizes = [2 * GB, 3 * GB]; + + let first = resolve_memory_layout(input( + 5 * GB, + Some(&sizes), + &mmio, + &pci_ecam, + &pci_mmio, + None, + )) + .unwrap(); + let second = resolve_memory_layout(input( + 5 * GB, + Some(&sizes), + &mmio, + &pci_ecam, + &pci_mmio, + None, + )) + .unwrap(); + + assert_eq!(first.ram(), second.ram()); + assert_eq!(first.end_of_layout(), second.end_of_layout()); + } + + #[test] + fn host_width_validation_happens_after_allocation() { + let mmio = [MemoryRange::new(GB..4 * 
GB)]; + let mut config = input(3 * GB, None, &mmio, &[], &[], None); + config.physical_address_size = 32; + + let err = resolve_memory_layout(config).unwrap_err(); + + assert!(err.to_string().contains("memory layout ends at")); + } +} diff --git a/openvmm/openvmm_core/src/worker/mod.rs b/openvmm/openvmm_core/src/worker/mod.rs index 23d0fe91f0..b36faa2154 100644 --- a/openvmm/openvmm_core/src/worker/mod.rs +++ b/openvmm/openvmm_core/src/worker/mod.rs @@ -2,5 +2,6 @@ // Licensed under the MIT License. pub mod dispatch; +mod memory_layout; mod rom; pub mod vm_loaders; diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index 621884a410..2de1998f87 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -269,6 +269,26 @@ impl MemoryLayout { Self::build(memory.to_vec(), gaps.to_vec(), vec![], vec![], None) } + /// Makes a new memory layout from already-resolved RAM and fixed ranges. + /// + /// The RAM, MMIO, PCI ECAM, and PCI MMIO ranges must each be in sorted + /// order, non-empty, and non-overlapping. The combined layout is also + /// validated for overlaps, including the optional VTL2 range. + pub fn new_from_resolved_ranges( + ram: Vec, + mmio_gaps: Vec, + pci_ecam_gaps: Vec, + pci_mmio_gaps: Vec, + vtl2_range: Option, + ) -> Result { + validate_ranges_with_metadata(&ram)?; + validate_ranges(&mmio_gaps)?; + validate_ranges(&pci_ecam_gaps)?; + validate_ranges(&pci_mmio_gaps)?; + + Self::build(ram, mmio_gaps, pci_ecam_gaps, pci_mmio_gaps, vtl2_range) + } + /// Builds the memory layout. /// /// `ram` must already be known to be sorted. @@ -398,7 +418,7 @@ impl MemoryLayout { /// One past the last byte of RAM, MMIO, PCI ECAM, or PCI MMIO. 
pub fn end_of_layout(&self) -> u64 { [ - self.mmio.last().expect("mmio set").end(), + self.mmio.last().map(|r| r.end()).unwrap_or(0), self.end_of_ram(), self.pci_ecam.last().map(|r| r.end()).unwrap_or(0), self.pci_mmio.last().map(|r| r.end()).unwrap_or(0), @@ -555,6 +575,69 @@ mod tests { MemoryLayout::new(TB, &[], pci_ecam, pci_mmio, None).unwrap_err(); } + #[test] + fn resolved_ranges_constructor() { + let ram = vec![ + MemoryRangeWithNode { + range: MemoryRange::new(0..GB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(2 * GB..3 * GB), + vnode: 1, + }, + ]; + let mmio = vec![MemoryRange::new(GB..2 * GB)]; + let pci_ecam = vec![MemoryRange::new(4 * GB..4 * GB + MB)]; + let pci_mmio = vec![MemoryRange::new(5 * GB..6 * GB)]; + + let layout = MemoryLayout::new_from_resolved_ranges( + ram.clone(), + mmio.clone(), + pci_ecam.clone(), + pci_mmio.clone(), + None, + ) + .unwrap(); + + assert_eq!(layout.ram(), ram); + assert_eq!(layout.mmio(), mmio); + assert_eq!(layout.probe_address(4 * GB), Some(AddressType::PciEcam)); + assert_eq!(layout.probe_address(5 * GB), Some(AddressType::PciMmio)); + } + + #[test] + fn resolved_ranges_reject_overlap_with_fixed_ranges() { + let ram = vec![MemoryRangeWithNode { + range: MemoryRange::new(0..2 * GB), + vnode: 0, + }]; + let mmio = vec![MemoryRange::new(GB..2 * GB)]; + + assert!(MemoryLayout::new_from_resolved_ranges(ram, mmio, vec![], vec![], None).is_err()); + } + + #[test] + fn resolved_ranges_validate_vtl2_against_ram_end() { + let ram = vec![ + MemoryRangeWithNode { + range: MemoryRange::new(0..GB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(3 * GB..4 * GB), + vnode: 0, + }, + ]; + let mmio = vec![MemoryRange::new(GB..2 * GB)]; + let vtl2_range = MemoryRange::new(2 * GB..2 * GB + MB); + + assert!(matches!( + MemoryLayout::new_from_resolved_ranges(ram, mmio, vec![], vec![], Some(vtl2_range)), + Err(Error::Vtl2RangeBeforeEndOfRam) + )); + } + #[test] fn pci_ranges() { let mmio = 
&[MemoryRange::new(3 * GB..4 * GB)]; From 347fa637005ba4bd015aecc5d30be7fd768fd11b Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 03:19:55 +0000 Subject: [PATCH 04/36] Clarify VTL2 RAM layout validation --- openvmm/openvmm_core/src/worker/memory_layout.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index d2ae7df689..da5eccb4c6 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -73,6 +73,9 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res }) .collect::>(); + // VTL2 is validated after RAM allocation rather than added as a fixed + // allocator range. Otherwise splittable RAM could skip over VTL2 and + // continue above it, which is not today's VTL2 placement contract. let memory_layout = MemoryLayout::new_from_resolved_ranges( ram, input.mmio_gaps.to_vec(), From 18793c4010774189ac3b8e3b22769d5f908289a6 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 03:59:23 +0000 Subject: [PATCH 05/36] vtl2 stuff --- openvmm/openvmm_core/src/worker/dispatch.rs | 17 +-- .../openvmm_core/src/worker/memory_layout.rs | 78 ++++++++++--- .../src/worker/vm_loaders/igvm.rs | 76 ++++--------- vm/vmcore/vm_topology/src/layout.rs | 104 +++++++++++++++++- 4 files changed, 185 insertions(+), 90 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 200762cf31..9d680cf8ed 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -866,7 +866,7 @@ impl InitializedVm { let physical_address_size = proto.max_physical_address_size(); // Determine if a special vtl2 memory allocation should be used. - let vtl2_range = if let LoadMode::Igvm { + let vtl2_layout = if let LoadMode::Igvm { vtl2_base_address, .. 
} = &cfg.load_mode { @@ -875,21 +875,16 @@ impl InitializedVm { | Vtl2BaseAddressType::Absolute(_) | Vtl2BaseAddressType::Vtl2Allocate { .. } => None, Vtl2BaseAddressType::MemoryLayout { size } => { - let vtl2_range = super::vm_loaders::igvm::vtl2_memory_range( - physical_address_size, - cfg.memory.mem_size, - &cfg.memory.mmio_gaps, - &cfg.memory.pci_ecam_gaps, - &cfg.memory.pci_mmio_gaps, + let vtl2_layout = super::vm_loaders::igvm::vtl2_memory_layout_request( igvm_file .as_ref() .expect("igvm file should be already parsed"), *size, ) - .context("unable to determine vtl2 memory range")?; - tracing::info!(?vtl2_range, "vtl2 memory range selected"); + .context("unable to determine vtl2 memory layout request")?; + tracing::info!(?vtl2_layout, "vtl2 memory layout request selected"); - Some(vtl2_range) + Some(vtl2_layout) } } } else { @@ -910,7 +905,7 @@ impl InitializedVm { mmio_gaps: &cfg.memory.mmio_gaps, pci_ecam_gaps: &cfg.memory.pci_ecam_gaps, pci_mmio_gaps: &cfg.memory.pci_mmio_gaps, - vtl2_range, + vtl2_layout, physical_address_size, }) .context("invalid memory configuration")?; diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index da5eccb4c6..8ae4f5db8b 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+use super::vm_loaders::igvm::Vtl2MemoryLayoutRequest; use anyhow::Context; use anyhow::bail; use memory_range::MemoryRange; @@ -17,7 +18,7 @@ pub(super) struct MemoryLayoutInput<'a> { pub mmio_gaps: &'a [MemoryRange], pub pci_ecam_gaps: &'a [MemoryRange], pub pci_mmio_gaps: &'a [MemoryRange], - pub vtl2_range: Option, + pub vtl2_layout: Option, pub physical_address_size: u8, } @@ -28,6 +29,7 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res let mut resolved_pci_ecam_gaps = vec![MemoryRange::EMPTY; input.pci_ecam_gaps.len()]; let mut resolved_pci_mmio_gaps = vec![MemoryRange::EMPTY; input.pci_mmio_gaps.len()]; let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; + let mut vtl2_range = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); add_fixed_ranges( @@ -58,6 +60,16 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, PAGE_SIZE); } + if let Some(vtl2_layout) = input.vtl2_layout { + builder.request( + "vtl2", + &mut vtl2_range, + vtl2_layout.size, + vtl2_layout.alignment, + Placement::PostMmio, + ); + } + builder .allocate() .context("allocating memory layout ranges")?; @@ -73,15 +85,14 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res }) .collect::>(); - // VTL2 is validated after RAM allocation rather than added as a fixed - // allocator range. Otherwise splittable RAM could skip over VTL2 and - // continue above it, which is not today's VTL2 placement contract. 
+ let vtl2_range = input.vtl2_layout.map(|_| vtl2_range); + let memory_layout = MemoryLayout::new_from_resolved_ranges( ram, input.mmio_gaps.to_vec(), input.pci_ecam_gaps.to_vec(), input.pci_mmio_gaps.to_vec(), - input.vtl2_range, + vtl2_range, ) .context("validating resolved memory layout")?; @@ -167,7 +178,7 @@ mod tests { mmio_gaps: &'a [MemoryRange], pci_ecam_gaps: &'a [MemoryRange], pci_mmio_gaps: &'a [MemoryRange], - vtl2_range: Option, + vtl2_layout: Option, ) -> MemoryLayoutInput<'a> { MemoryLayoutInput { mem_size, @@ -175,11 +186,18 @@ mod tests { mmio_gaps, pci_ecam_gaps, pci_mmio_gaps, - vtl2_range, + vtl2_layout, physical_address_size: 46, } } + fn vtl2_layout(size: u64) -> Vtl2MemoryLayoutRequest { + Vtl2MemoryLayoutRequest { + size, + alignment: PAGE_SIZE, + } + } + #[test] fn non_numa_matches_memory_layout_new() { let mmio = [ @@ -233,25 +251,49 @@ mod tests { } #[test] - fn vtl2_is_validated_after_ram_placement() { + fn vtl2_is_allocated_after_all_mmio() { let mmio = [MemoryRange::new(GB..2 * GB)]; - let vtl2_range = MemoryRange::new(GB..GB + 2 * MB); + let pci_ecam = [MemoryRange::new(3 * GB..3 * GB + MB)]; + let pci_mmio = [MemoryRange::new(7 * GB..8 * GB)]; - let err = resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], Some(vtl2_range))) - .unwrap_err(); + let actual = resolve_memory_layout(input( + 4 * GB, + None, + &mmio, + &pci_ecam, + &pci_mmio, + Some(vtl2_layout(2 * MB)), + )) + .unwrap(); - assert!( - err.to_string() - .contains("validating resolved memory layout") + assert_eq!(actual.end_of_layout(), 8 * GB); + assert_eq!( + actual.vtl2_range(), + Some(MemoryRange::new(8 * GB..8 * GB + 2 * MB)) ); } #[test] - fn vtl2_does_not_split_ram() { - let vtl2_range = MemoryRange::new(GB..2 * GB); + fn vtl2_does_not_change_ram_placement() { + let mmio = [MemoryRange::new(GB..2 * GB)]; + + let without_vtl2 = + resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], None)).unwrap(); + let with_vtl2 = resolve_memory_layout(input( + 2 * GB, 
+ None, + &mmio, + &[], + &[], + Some(vtl2_layout(2 * MB)), + )) + .unwrap(); - assert!( - resolve_memory_layout(input(2 * GB, None, &[], &[], &[], Some(vtl2_range))).is_err() + assert_eq!(with_vtl2.ram(), without_vtl2.ram()); + assert_eq!(with_vtl2.end_of_layout(), without_vtl2.end_of_layout()); + assert_eq!( + with_vtl2.vtl2_range(), + Some(MemoryRange::new(3 * GB..3 * GB + 2 * MB)) ); } diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index 05a0124e66..399ff44c63 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -76,16 +76,14 @@ pub enum Error { NoVtl2MemoryRange, #[error("no vtl2 memory source in igvm file")] Vtl2MemorySource, - #[error("invalid memory config")] - MemoryConfig(#[source] vm_topology::memory::Error), - #[error("not enough physical address bits to allocate vtl2 range")] - NotEnoughPhysicalAddressBits, #[error("building device tree for partition failed")] DeviceTree(fdt::builder::Error), #[error("supplied vtl2 memory {0} is not aligned to 2MB")] Vtl2MemoryAligned(u64), #[error("supplied vtl2 memory {0} is smaller than igvm file VTL2 range {1}")] Vtl2MemoryTooSmall(u64, u64), + #[error("invalid vtl2 relocation alignment {0:#x}")] + Vtl2RelocationAlignment(u64), #[error("unsupported guest architecture")] UnsupportedGuestArch, #[error("igvm file does not support vbs")] @@ -199,17 +197,21 @@ pub fn vtl2_memory_info(igvm_file: &IgvmFile) -> Result { } } -/// Determine a location to allocate VTL2 memory, based on VM information and a -/// provided `igvm_file`. -pub fn vtl2_memory_range( - physical_address_size: u8, - mem_size: u64, - mmio_gaps: &[MemoryRange], - pci_ecam_gaps: &[MemoryRange], - pci_mmio_gaps: &[MemoryRange], +/// Information needed to allocate a VTL2 memory range in the VM memory layout. 
+#[derive(Debug, Clone, Copy)] +pub struct Vtl2MemoryLayoutRequest { + /// The number of bytes to reserve for VTL2. + pub size: u64, + /// The required relocation alignment. + pub alignment: u64, +} + +/// Determine the VTL2 memory allocation constraints from a provided +/// `igvm_file`. +pub fn vtl2_memory_layout_request( igvm_file: &IgvmFile, vtl2_size: Option, -) -> Result { +) -> Result { let (mask, _max_vtl) = match vbs_platform_header(igvm_file)? { IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert_eq!(info.platform_type, IgvmPlatformType::VSM_ISOLATION); @@ -228,6 +230,9 @@ pub fn vtl2_memory_range( let reloc_region = relocs.0.ok_or(Error::RelocationNotSupported)?[0].clone(); let alignment = reloc_region.relocation_alignment; + if alignment < HV_PAGE_SIZE || !alignment.is_power_of_two() { + return Err(Error::Vtl2RelocationAlignment(alignment)); + } let size = match vtl2_size { Some(vtl2_size) => { @@ -248,50 +253,7 @@ pub fn vtl2_memory_range( } }; - let align_base = |base| -> u64 { (base + alignment - 1) & !(alignment - 1) }; - - // Use one bit below the maximum possible address, as the VTL0 alias map - // will use the highest available bit of the physical address space. - let physical_address_size = physical_address_size - 1; - - // Create an initial memory layout to determine the highest used address. - let dummy_layout = MemoryLayout::new(mem_size, mmio_gaps, pci_ecam_gaps, pci_mmio_gaps, None) - .map_err(Error::MemoryConfig)?; - - // TODO: Underhill kernel panics if loaded at 32TB or higher. Restrict the - // max address to 32TB until this is fixed. - const MAX_ADDR_32TB: u64 = 32u64 << 40; // 0x2000_0000_0000 bytes - let max_physical_address = 1 << physical_address_size; - let max_physical_address = max_physical_address.min(MAX_ADDR_32TB); - - // With more than two mmio gaps, it's harder to reason about which space is - // free or not in the address space to allocate a VTL2 range. 
Take a - // shortcut and place VTL2 above the end of ram or mmio. - let (min_addr, max_addr) = (dummy_layout.end_of_layout(), max_physical_address); - - let aligned_min_addr = align_base(min_addr); - let aligned_max_addr = (max_addr / alignment) * alignment; - - assert!(aligned_min_addr >= reloc_region.minimum_relocation_gpa); - assert!(aligned_max_addr <= reloc_region.maximum_relocation_gpa); - - // It's possible that the min_addr is above the physical address size of the - // system. Fail now as mapping ram would fail later. - if aligned_min_addr >= aligned_max_addr { - return Err(Error::NotEnoughPhysicalAddressBits); - } - - tracing::trace!(min_addr, aligned_min_addr, max_addr, aligned_max_addr); - - // Select a random base within the alignment - let possible_bases = (aligned_max_addr - aligned_min_addr) / alignment; - let mut num: u64 = 0; - getrandom::fill(num.as_mut_bytes()).expect("crng failure"); - let selected_base = num % (possible_bases - 1); - let selected_addr = aligned_min_addr + (selected_base * alignment); - tracing::trace!(possible_bases, selected_base, selected_addr); - - Ok(MemoryRange::new(selected_addr..(selected_addr + size))) + Ok(Vtl2MemoryLayoutRequest { size, alignment }) } /// Build a device tree representing the whole guest partition. diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index e16e009c4e..729d59ee6e 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -4,10 +4,10 @@ //! VM address-space layout allocator. //! //! This module provides a pure-math layout allocator that places fixed ranges, -//! 32-bit MMIO, ordinary RAM, and 64-bit MMIO in a flat guest physical address -//! map. It has no knowledge of specific architectures, firmware types, or -//! chipset conventions; callers express those policies as fixed ranges and -//! dynamic requests. +//! 32-bit MMIO, ordinary RAM, 64-bit MMIO, and post-MMIO ranges in a flat guest +//! physical address map. 
It has no knowledge of specific architectures, +//! firmware types, or chipset conventions; callers express those policies as +//! fixed ranges and dynamic requests. //! //! # Usage //! @@ -59,6 +59,12 @@ pub enum Placement { Mmio32, /// The allocation is placed bottom up from the end of RAM. Mmio64, + /// The allocation is placed bottom up after RAM and all MMIO allocations. + /// + /// Post-MMIO requests are allocated in caller order, not sorted by size or + /// alignment, so they can be used for private implementation ranges that + /// must not perturb the guest-visible RAM/MMIO layout. + PostMmio, } /// The kind of a produced allocation. @@ -72,6 +78,8 @@ pub enum PlacedRangeKind { Ram, /// A 64-bit MMIO allocation. Mmio64, + /// A post-MMIO allocation. + PostMmio, } /// Allocation phase reported in [`AllocateError::Exhausted`]. @@ -83,6 +91,8 @@ pub enum AllocationPhase { Ram, /// 64-bit MMIO placement. Mmio64, + /// Post-MMIO placement. + PostMmio, } /// A placed range returned by [`LayoutBuilder::allocate`]. @@ -102,6 +112,7 @@ pub struct LayoutBuilder<'a> { mmio32: Vec>, ram: Vec>, mmio64: Vec>, + post_mmio: Vec>, } struct FixedRequest<'a> { @@ -317,6 +328,47 @@ impl AllocationState { Ok(()) } + fn place_post_mmio( + &mut self, + requests: &mut [DynamicRequest<'_>], + ) -> Result<(), AllocateError> { + // These ranges are intentionally placed after all RAM/MMIO work and in + // caller order. They are for implementation-private ranges that should + // not change the VTL0-visible layout or be reordered by alignment. 
+ for request in requests { + let layout_top = self.layout_top(); + let Some(start) = find_lowest_fit( + &self.free, + request.size, + request.alignment, + layout_top, + ADDRESS_LIMIT, + ) else { + return Err(exhausted_error( + request, + AllocationPhase::PostMmio, + &self.free, + layout_top, + ADDRESS_LIMIT, + )); + }; + + let range = MemoryRange::new(start..start + request.size); + *request.target = range; + self.allocate_range(&request.tag, PlacedRangeKind::PostMmio, range); + } + + Ok(()) + } + + fn layout_top(&self) -> u64 { + self.allocations + .iter() + .map(|allocation| allocation.range.end()) + .max() + .unwrap_or(0) + } + fn record(&mut self, tag: &str, kind: PlacedRangeKind, range: MemoryRange) { self.allocations.push(PlacedRange { tag: tag.to_string(), @@ -437,6 +489,7 @@ impl<'a> LayoutBuilder<'a> { mmio32: Vec::new(), ram: Vec::new(), mmio64: Vec::new(), + post_mmio: Vec::new(), } } @@ -471,6 +524,12 @@ impl<'a> LayoutBuilder<'a> { size, alignment, }), + Placement::PostMmio => self.post_mmio.push(DynamicRequest { + tag: tag.into(), + target, + size, + alignment, + }), } } @@ -501,12 +560,14 @@ impl<'a> LayoutBuilder<'a> { validate_dynamic_requests(&self.mmio32)?; validate_ram_requests(&self.ram)?; validate_dynamic_requests(&self.mmio64)?; + validate_dynamic_requests(&self.post_mmio)?; let mut state = AllocationState::new(); state.place_fixed(&mut self.fixed)?; state.place_mmio32(&mut self.mmio32)?; state.place_ram(&mut self.ram)?; state.place_mmio64(&mut self.mmio64)?; + state.place_post_mmio(&mut self.post_mmio)?; state.allocations.sort_by_key(|allocation| allocation.range); Ok(state.allocations) @@ -938,6 +999,41 @@ mod tests { assert_eq!(mmio64, MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB)); } + #[test] + fn post_mmio_uses_bottom_up_placement_after_all_mmio() { + let mut ram = Vec::new(); + let mut mmio64 = MemoryRange::EMPTY; + let mut post_mmio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 
* GIB, PAGE_SIZE); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); + builder.request("post_mmio", &mut post_mmio, MIB, MIB, Placement::PostMmio); + + builder.allocate().unwrap(); + + assert_eq!(mmio64, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert_eq!( + post_mmio, + MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB) + ); + } + + #[test] + fn post_mmio_preserves_request_order() { + let mut ram = Vec::new(); + let mut first = MemoryRange::EMPTY; + let mut aligned = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.request("first", &mut first, MIB, MIB, Placement::PostMmio); + builder.request("aligned", &mut aligned, MIB, GIB, Placement::PostMmio); + + builder.allocate().unwrap(); + + assert_eq!(first, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert_eq!(aligned, MemoryRange::new(3 * GIB..3 * GIB + MIB)); + } + #[test] fn fixed_hypertransport_hole_is_regular_fixed_placement() { let mut ram = Vec::new(); From 289b13e6ca86ff4daa76cd91654cc97c53b67e07 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 04:07:48 +0000 Subject: [PATCH 06/36] cleanup --- vm/vmcore/vm_topology/src/layout.rs | 197 +++++++++++++++++++++++----- 1 file changed, 166 insertions(+), 31 deletions(-) diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index 729d59ee6e..1c90599158 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -3,11 +3,11 @@ //! VM address-space layout allocator. //! -//! This module provides a pure-math layout allocator that places fixed ranges, -//! 32-bit MMIO, ordinary RAM, 64-bit MMIO, and post-MMIO ranges in a flat guest -//! physical address map. It has no knowledge of specific architectures, -//! firmware types, or chipset conventions; callers express those policies as -//! fixed ranges and dynamic requests. +//! 
This module provides a pure-math layout allocator that places reserved and +//! fixed ranges, 32-bit MMIO, ordinary RAM, 64-bit MMIO, and post-MMIO ranges in +//! a flat guest physical address map. It has no knowledge of specific +//! architectures, firmware types, or chipset conventions; callers express those +//! policies as reserved/fixed ranges and dynamic requests. //! //! # Usage //! @@ -70,6 +70,8 @@ pub enum Placement { /// The kind of a produced allocation. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum PlacedRangeKind { + /// A reserved range supplied by the caller. + Reserved, /// A fixed allocation supplied by the caller. Fixed, /// A 32-bit MMIO allocation. @@ -108,6 +110,7 @@ pub struct PlacedRange { /// A builder for computing a deterministic VM address-space layout. pub struct LayoutBuilder<'a> { + reserved: Vec, fixed: Vec>, mmio32: Vec>, ram: Vec>, @@ -115,6 +118,11 @@ pub struct LayoutBuilder<'a> { post_mmio: Vec>, } +struct ReservedRequest { + tag: String, + range: MemoryRange, +} + struct FixedRequest<'a> { tag: String, target: &'a mut MemoryRange, @@ -194,11 +202,7 @@ impl AllocationState { } fn place_fixed(&mut self, requests: &mut [FixedRequest<'_>]) -> Result<(), AllocateError> { - // Fixed ranges represent policy decisions made by the caller: reserved - // architectural/chipset zones, firmware conventions, and any other - // pinned addresses. They seed the free list before dynamic placement; - // this layer does not assign special meaning to particular fixed tags. 
- let mut fixed = requests + let fixed = requests .iter() .enumerate() .map(|(index, request)| { @@ -209,21 +213,6 @@ impl AllocationState { }) .collect::>(); - fixed.sort_by_key(|(range, _)| range.start()); - - for pair in fixed.windows(2) { - let (range_a, index_a) = pair[0]; - let (range_b, index_b) = pair[1]; - if range_a.overlaps(&range_b) { - return Err(AllocateError::FixedOverlap { - tag_a: requests[index_a].tag.clone(), - range_a, - tag_b: requests[index_b].tag.clone(), - range_b, - }); - } - } - for &(range, request_index) in &fixed { *requests[request_index].target = range; self.allocate_range(&requests[request_index].tag, PlacedRangeKind::Fixed, range); @@ -232,6 +221,12 @@ impl AllocationState { Ok(()) } + fn place_reserved(&mut self, requests: &[ReservedRequest]) { + for request in requests { + self.allocate_range(&request.tag, PlacedRangeKind::Reserved, request.range); + } + } + fn place_mmio32(&mut self, requests: &mut [DynamicRequest<'_>]) -> Result<(), AllocateError> { // Pack 32-bit MMIO from the top of the 4 GiB window downward so RAM can // start at GPA 0 and grow upward through the lowest remaining space. @@ -364,6 +359,7 @@ impl AllocationState { fn layout_top(&self) -> u64 { self.allocations .iter() + .filter(|allocation| allocation.kind != PlacedRangeKind::Reserved) .map(|allocation| allocation.range.end()) .max() .unwrap_or(0) @@ -451,16 +447,16 @@ pub enum AllocateError { /// The requested size. size: u64, }, - /// Two fixed requests overlap. - #[error("fixed requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] + /// Two fixed or reserved requests overlap. + #[error("fixed/reserved requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] FixedOverlap { - /// The tag of the first fixed request. + /// The tag of the first request. tag_a: String, - /// The range of the first fixed request. + /// The range of the first request. range_a: MemoryRange, - /// The tag of the second fixed request. 
+ /// The tag of the second request. tag_b: String, - /// The range of the second fixed request. + /// The range of the second request. range_b: MemoryRange, }, /// A dynamic request could not be satisfied. @@ -485,6 +481,7 @@ impl<'a> LayoutBuilder<'a> { /// Creates a new layout builder. pub fn new() -> Self { Self { + reserved: Vec::new(), fixed: Vec::new(), mmio32: Vec::new(), ram: Vec::new(), @@ -493,6 +490,18 @@ impl<'a> LayoutBuilder<'a> { } } + /// Reserves a range so no allocation can use it. + /// + /// Reserved ranges are removed from the free list and may appear in the + /// returned [`PlacedRange`] list, but they do not affect post-MMIO + /// placement. Trailing reserved ranges are omitted from the returned list. + pub fn reserve(&mut self, tag: impl Into, range: MemoryRange) { + self.reserved.push(ReservedRequest { + tag: tag.into(), + range, + }); + } + /// Adds a single-range request to the builder. /// /// The target is filled in when [`Self::allocate`] succeeds. @@ -556,13 +565,16 @@ impl<'a> LayoutBuilder<'a> { /// Allocates all requests, fills in each target, and returns every placed /// range sorted by address. 
pub fn allocate(mut self) -> Result, AllocateError> { + validate_reserved_requests(&self.reserved)?; validate_fixed_requests(&self.fixed)?; + validate_pinned_ranges(&self.reserved, &self.fixed)?; validate_dynamic_requests(&self.mmio32)?; validate_ram_requests(&self.ram)?; validate_dynamic_requests(&self.mmio64)?; validate_dynamic_requests(&self.post_mmio)?; let mut state = AllocationState::new(); + state.place_reserved(&self.reserved); state.place_fixed(&mut self.fixed)?; state.place_mmio32(&mut self.mmio32)?; state.place_ram(&mut self.ram)?; @@ -570,6 +582,13 @@ impl<'a> LayoutBuilder<'a> { state.place_post_mmio(&mut self.post_mmio)?; state.allocations.sort_by_key(|allocation| allocation.range); + while state + .allocations + .last() + .is_some_and(|allocation| allocation.kind == PlacedRangeKind::Reserved) + { + state.allocations.pop(); + } Ok(state.allocations) } } @@ -598,6 +617,28 @@ fn validate_size_alignment(tag: &str, size: u64, alignment: u64) -> Result<(), A Ok(()) } +fn validate_reserved_requests(requests: &[ReservedRequest]) -> Result<(), AllocateError> { + for request in requests { + validate_size_alignment(&request.tag, request.range.len(), PAGE_SIZE)?; + if !request.range.start().is_multiple_of(PAGE_SIZE) { + return Err(AllocateError::InvalidFixedAddress { + tag: request.tag.clone(), + address: request.range.start(), + }); + } + + if request.range.end() > ADDRESS_LIMIT { + return Err(AllocateError::FixedRangeOverflow { + tag: request.tag.clone(), + address: request.range.start(), + size: request.range.len(), + }); + } + } + + Ok(()) +} + fn validate_fixed_requests(requests: &[FixedRequest<'_>]) -> Result<(), AllocateError> { for request in requests { validate_size_alignment(&request.tag, request.size, request.alignment)?; @@ -628,6 +669,39 @@ fn validate_fixed_requests(requests: &[FixedRequest<'_>]) -> Result<(), Allocate Ok(()) } +fn validate_pinned_ranges( + reserved_requests: &[ReservedRequest], + fixed_requests: &[FixedRequest<'_>], +) -> 
Result<(), AllocateError> { + let mut pinned = reserved_requests + .iter() + .map(|request| (request.range, request.tag.as_str())) + .chain(fixed_requests.iter().map(|request| { + ( + MemoryRange::new(request.base..request.base + request.size), + request.tag.as_str(), + ) + })) + .collect::>(); + + pinned.sort_by_key(|(range, _)| range.start()); + + for pair in pinned.windows(2) { + let (range_a, tag_a) = pair[0]; + let (range_b, tag_b) = pair[1]; + if range_a.overlaps(&range_b) { + return Err(AllocateError::FixedOverlap { + tag_a: tag_a.to_string(), + range_a, + tag_b: tag_b.to_string(), + range_b, + }); + } + } + + Ok(()) +} + fn validate_dynamic_requests(requests: &[DynamicRequest<'_>]) -> Result<(), AllocateError> { for request in requests { validate_size_alignment(&request.tag, request.size, request.alignment)?; @@ -885,6 +959,24 @@ mod tests { )); } + #[test] + fn reserved_overlap_rejected() { + let mut fixed = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.reserve("reserved", MemoryRange::new(GIB..GIB + MIB)); + builder.request( + "fixed", + &mut fixed, + MIB, + PAGE_SIZE, + Placement::Fixed(GIB + PAGE_SIZE), + ); + + let error = builder.allocate().unwrap_err(); + + assert!(matches!(error, AllocateError::FixedOverlap { .. 
})); + } + #[test] fn mmio32_uses_top_down_placement_below_4_gib() { let mut reserved = MemoryRange::EMPTY; @@ -1034,6 +1126,49 @@ mod tests { assert_eq!(aligned, MemoryRange::new(3 * GIB..3 * GIB + MIB)); } + #[test] + fn high_reserved_range_does_not_affect_post_mmio_placement() { + let mut ram = Vec::new(); + let mut post_mmio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.reserve( + "high_reserved", + MemoryRange::new(0xFD_0000_0000..0xFD_4000_0000), + ); + builder.request("post_mmio", &mut post_mmio, MIB, MIB, Placement::PostMmio); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(post_mmio, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert!( + !sorted + .iter() + .any(|allocation| allocation.kind == PlacedRangeKind::Reserved) + ); + } + + #[test] + fn reserved_range_between_allocations_is_reported() { + let mut ram = Vec::new(); + let mut post_mmio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.reserve("reserved", MemoryRange::new(2 * GIB..2 * GIB + MIB)); + builder.request("post_mmio", &mut post_mmio, MIB, MIB, Placement::PostMmio); + + let sorted = builder.allocate().unwrap(); + + assert_eq!( + post_mmio, + MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB) + ); + assert!(sorted.iter().any(|allocation| { + allocation.kind == PlacedRangeKind::Reserved + && allocation.range == MemoryRange::new(2 * GIB..2 * GIB + MIB) + })); + } + #[test] fn fixed_hypertransport_hole_is_regular_fixed_placement() { let mut ram = Vec::new(); From b07d719e6c72789e03f15436695a2922721a345c Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 04:20:28 +0000 Subject: [PATCH 07/36] wip --- .../openvmm_core/src/worker/memory_layout.rs | 68 ++--- vm/vmcore/vm_topology/src/layout.rs | 268 +++--------------- 2 files changed, 75 insertions(+), 261 deletions(-) diff --git 
a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 8ae4f5db8b..265244993c 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -11,6 +11,7 @@ use vm_topology::memory::MemoryLayout; use vm_topology::memory::MemoryRangeWithNode; const PAGE_SIZE: u64 = 4096; +const RAM_ALIGNMENT: u64 = 1024 * 1024 * 1024; pub(super) struct MemoryLayoutInput<'a> { pub mem_size: u64, @@ -25,31 +26,13 @@ pub(super) struct MemoryLayoutInput<'a> { pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Result { let ram_sizes = validate_ram_sizes(input.mem_size, input.numa_mem_sizes)?; - let mut resolved_mmio_gaps = vec![MemoryRange::EMPTY; input.mmio_gaps.len()]; - let mut resolved_pci_ecam_gaps = vec![MemoryRange::EMPTY; input.pci_ecam_gaps.len()]; - let mut resolved_pci_mmio_gaps = vec![MemoryRange::EMPTY; input.pci_mmio_gaps.len()]; let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; let mut vtl2_range = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); - add_fixed_ranges( - &mut builder, - "mmio", - input.mmio_gaps, - &mut resolved_mmio_gaps, - ); - add_fixed_ranges( - &mut builder, - "pci_ecam", - input.pci_ecam_gaps, - &mut resolved_pci_ecam_gaps, - ); - add_fixed_ranges( - &mut builder, - "pci_mmio", - input.pci_mmio_gaps, - &mut resolved_pci_mmio_gaps, - ); + add_fixed_ranges(&mut builder, "mmio", input.mmio_gaps); + add_fixed_ranges(&mut builder, "pci_ecam", input.pci_ecam_gaps); + add_fixed_ranges(&mut builder, "pci_mmio", input.pci_mmio_gaps); for (vnode, (ram_size, ram_ranges)) in ram_sizes .iter() @@ -57,7 +40,7 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res .zip(&mut ram_ranges_by_node) .enumerate() { - builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, PAGE_SIZE); + builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, RAM_ALIGNMENT); } if let Some(vtl2_layout) 
= input.vtl2_layout { @@ -108,20 +91,9 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res Ok(memory_layout) } -fn add_fixed_ranges<'a>( - builder: &mut LayoutBuilder<'a>, - tag_prefix: &str, - ranges: &[MemoryRange], - targets: &'a mut [MemoryRange], -) { - for (index, (range, target)) in ranges.iter().zip(targets).enumerate() { - builder.request( - format!("{tag_prefix}[{index}]"), - target, - range.len(), - PAGE_SIZE, - Placement::Fixed(range.start()), - ); +fn add_fixed_ranges(builder: &mut LayoutBuilder<'_>, tag_prefix: &str, ranges: &[MemoryRange]) { + for (index, range) in ranges.iter().enumerate() { + builder.fixed(format!("{tag_prefix}[{index}]"), *range); } } @@ -204,7 +176,7 @@ mod tests { MemoryRange::new(2 * GB..3 * GB), MemoryRange::new(4 * GB..5 * GB), ]; - let pci_ecam = [MemoryRange::new(8 * GB..8 * GB + MB)]; + let pci_ecam = [MemoryRange::new(8 * GB..9 * GB)]; let pci_mmio = [MemoryRange::new(6 * GB..7 * GB)]; let actual = @@ -250,6 +222,28 @@ mod tests { })); } + #[test] + fn ram_chunks_start_on_gb_alignment() { + let mmio = [MemoryRange::new(GB + MB..GB + 2 * MB)]; + + let actual = resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], None)).unwrap(); + + assert_eq!( + actual.ram(), + &[ + MemoryRangeWithNode { + range: MemoryRange::new(0..GB + MB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(2 * GB..3 * GB - MB), + vnode: 0, + }, + ] + ); + assert!(actual.ram().iter().all(|ram| ram.range.start() % GB == 0)); + } + #[test] fn vtl2_is_allocated_after_all_mmio() { let mmio = [MemoryRange::new(GB..2 * GB)]; diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index 1c90599158..d8f5f08239 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -15,17 +15,13 @@ //! use memory_range::MemoryRange; //! use vm_topology::layout::{LayoutBuilder, Placement}; //! -//! let mut reserved = MemoryRange::EMPTY; //! 
let mut ram = Vec::new(); //! let mut vmbus = MemoryRange::EMPTY; //! //! let mut builder = LayoutBuilder::new(); -//! builder.request( +//! builder.fixed( //! "reserved", -//! &mut reserved, -//! 32 * 1024 * 1024, -//! 4096, -//! Placement::Fixed(0xFE00_0000), +//! MemoryRange::new(0xFE00_0000..0x1_0000_0000), //! ); //! builder.request( //! "vmbus", @@ -37,7 +33,6 @@ //! builder.ram("ram", &mut ram, 2 * 1024 * 1024 * 1024, 4096); //! //! let sorted = builder.allocate().unwrap(); -//! assert_eq!(reserved, MemoryRange::new(0xFE00_0000..0x1_0000_0000)); //! assert_eq!(ram, [MemoryRange::new(0..0x8000_0000)]); //! assert_eq!(vmbus.end(), 0xFE00_0000); //! assert_eq!(sorted.len(), 3); @@ -50,11 +45,9 @@ const PAGE_SIZE: u64 = 4096; const FOUR_GIB: u64 = 0x1_0000_0000; const ADDRESS_LIMIT: u64 = MemoryRange::MAX_ADDRESS; -/// The placement class for a single-range layout request. +/// The placement class for a dynamic single-range layout request. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Placement { - /// The allocation must be placed exactly at the given address. - Fixed(u64), /// The allocation must fit below the 4 GiB boundary and is placed top down. Mmio32, /// The allocation is placed bottom up from the end of RAM. @@ -111,7 +104,7 @@ pub struct PlacedRange { /// A builder for computing a deterministic VM address-space layout. 
pub struct LayoutBuilder<'a> { reserved: Vec, - fixed: Vec>, + fixed: Vec, mmio32: Vec>, ram: Vec>, mmio64: Vec>, @@ -123,12 +116,9 @@ struct ReservedRequest { range: MemoryRange, } -struct FixedRequest<'a> { +struct FixedRequest { tag: String, - target: &'a mut MemoryRange, - base: u64, - size: u64, - alignment: u64, + range: MemoryRange, } struct DynamicRequest<'a> { @@ -201,21 +191,9 @@ impl AllocationState { } } - fn place_fixed(&mut self, requests: &mut [FixedRequest<'_>]) -> Result<(), AllocateError> { - let fixed = requests - .iter() - .enumerate() - .map(|(index, request)| { - ( - MemoryRange::new(request.base..request.base + request.size), - index, - ) - }) - .collect::>(); - - for &(range, request_index) in &fixed { - *requests[request_index].target = range; - self.allocate_range(&requests[request_index].tag, PlacedRangeKind::Fixed, range); + fn place_fixed(&mut self, requests: &[FixedRequest]) -> Result<(), AllocateError> { + for request in requests { + self.allocate_range(&request.tag, PlacedRangeKind::Fixed, request.range); } Ok(()) @@ -427,26 +405,6 @@ pub enum AllocateError { /// The invalid alignment. alignment: u64, }, - /// A fixed request has a non-page-aligned address. - #[error("{tag}: fixed address {address:#x} is not page-aligned")] - InvalidFixedAddress { - /// The tag identifying the request. - tag: String, - /// The invalid address. - address: u64, - }, - /// A fixed request's range cannot be represented. - #[error( - "{tag}: fixed range starting at {address:#x} with size {size:#x} exceeds the address space" - )] - FixedRangeOverflow { - /// The tag identifying the request. - tag: String, - /// The start address. - address: u64, - /// The requested size. - size: u64, - }, /// Two fixed or reserved requests overlap. #[error("fixed/reserved requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] FixedOverlap { @@ -502,7 +460,16 @@ impl<'a> LayoutBuilder<'a> { }); } - /// Adds a single-range request to the builder. 
+ /// Adds a fixed range request to the builder. + /// + pub fn fixed(&mut self, tag: impl Into, range: MemoryRange) { + self.fixed.push(FixedRequest { + tag: tag.into(), + range, + }); + } + + /// Adds a dynamic single-range request to the builder. /// /// The target is filled in when [`Self::allocate`] succeeds. pub fn request( @@ -514,13 +481,6 @@ impl<'a> LayoutBuilder<'a> { placement: Placement, ) { match placement { - Placement::Fixed(base) => self.fixed.push(FixedRequest { - tag: tag.into(), - target, - base, - size, - alignment, - }), Placement::Mmio32 => self.mmio32.push(DynamicRequest { tag: tag.into(), target, @@ -575,7 +535,7 @@ impl<'a> LayoutBuilder<'a> { let mut state = AllocationState::new(); state.place_reserved(&self.reserved); - state.place_fixed(&mut self.fixed)?; + state.place_fixed(&self.fixed)?; state.place_mmio32(&mut self.mmio32)?; state.place_ram(&mut self.ram)?; state.place_mmio64(&mut self.mmio64)?; @@ -620,50 +580,14 @@ fn validate_size_alignment(tag: &str, size: u64, alignment: u64) -> Result<(), A fn validate_reserved_requests(requests: &[ReservedRequest]) -> Result<(), AllocateError> { for request in requests { validate_size_alignment(&request.tag, request.range.len(), PAGE_SIZE)?; - if !request.range.start().is_multiple_of(PAGE_SIZE) { - return Err(AllocateError::InvalidFixedAddress { - tag: request.tag.clone(), - address: request.range.start(), - }); - } - - if request.range.end() > ADDRESS_LIMIT { - return Err(AllocateError::FixedRangeOverflow { - tag: request.tag.clone(), - address: request.range.start(), - size: request.range.len(), - }); - } } Ok(()) } -fn validate_fixed_requests(requests: &[FixedRequest<'_>]) -> Result<(), AllocateError> { +fn validate_fixed_requests(requests: &[FixedRequest]) -> Result<(), AllocateError> { for request in requests { - validate_size_alignment(&request.tag, request.size, request.alignment)?; - if !request.base.is_multiple_of(PAGE_SIZE) { - return Err(AllocateError::InvalidFixedAddress { - tag: 
request.tag.clone(), - address: request.base, - }); - } - - let Some(end) = request.base.checked_add(request.size) else { - return Err(AllocateError::FixedRangeOverflow { - tag: request.tag.clone(), - address: request.base, - size: request.size, - }); - }; - - if end > ADDRESS_LIMIT { - return Err(AllocateError::FixedRangeOverflow { - tag: request.tag.clone(), - address: request.base, - size: request.size, - }); - } + validate_size_alignment(&request.tag, request.range.len(), PAGE_SIZE)?; } Ok(()) @@ -671,17 +595,16 @@ fn validate_fixed_requests(requests: &[FixedRequest<'_>]) -> Result<(), Allocate fn validate_pinned_ranges( reserved_requests: &[ReservedRequest], - fixed_requests: &[FixedRequest<'_>], + fixed_requests: &[FixedRequest], ) -> Result<(), AllocateError> { let mut pinned = reserved_requests .iter() .map(|request| (request.range, request.tag.as_str())) - .chain(fixed_requests.iter().map(|request| { - ( - MemoryRange::new(request.base..request.base + request.size), - request.tag.as_str(), - ) - })) + .chain( + fixed_requests + .iter() + .map(|request| (request.range, request.tag.as_str())), + ) .collect::>(); pinned.sort_by_key(|(range, _)| range.start()); @@ -865,43 +788,22 @@ mod tests { } #[test] - fn fixed_request_fills_target() { - let mut target = MemoryRange::EMPTY; + fn fixed_request_is_reported() { let mut builder = LayoutBuilder::new(); - builder.request( - "fixed", - &mut target, - 4 * MIB, - PAGE_SIZE, - Placement::Fixed(0xFC00_0000), - ); + let range = MemoryRange::new(0xFC00_0000..0xFC40_0000); + builder.fixed("fixed", range); let sorted = builder.allocate().unwrap(); - assert_eq!(target, MemoryRange::new(0xFC00_0000..0xFC40_0000)); - assert_eq!(sorted[0].range, target); + assert_eq!(sorted[0].range, range); assert_eq!(sorted[0].kind, PlacedRangeKind::Fixed); } #[test] fn fixed_overlap_rejected() { - let mut first = MemoryRange::EMPTY; - let mut second = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); - builder.request( - 
"first", - &mut first, - 8 * KIB, - PAGE_SIZE, - Placement::Fixed(0x1000), - ); - builder.request( - "second", - &mut second, - 4 * KIB, - PAGE_SIZE, - Placement::Fixed(0x2000), - ); + builder.fixed("first", MemoryRange::new(0x1000..0x3000)); + builder.fixed("second", MemoryRange::new(0x2000..0x3000)); let error = builder.allocate().unwrap_err(); @@ -925,51 +827,15 @@ mod tests { builder.allocate().unwrap_err(), AllocateError::InvalidAlignment { .. } )); - - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(); - builder.request( - "fixed", - &mut target, - PAGE_SIZE, - PAGE_SIZE, - Placement::Fixed(0x1234), - ); - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::InvalidFixedAddress { .. } - )); - } - - #[test] - fn fixed_range_overflow_rejected() { - let mut target = MemoryRange::EMPTY; - let mut builder = LayoutBuilder::new(); - builder.request( - "overflow", - &mut target, - 2 * PAGE_SIZE, - PAGE_SIZE, - Placement::Fixed(ADDRESS_LIMIT - PAGE_SIZE), - ); - - assert!(matches!( - builder.allocate().unwrap_err(), - AllocateError::FixedRangeOverflow { .. 
} - )); } #[test] fn reserved_overlap_rejected() { - let mut fixed = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); builder.reserve("reserved", MemoryRange::new(GIB..GIB + MIB)); - builder.request( + builder.fixed( "fixed", - &mut fixed, - MIB, - PAGE_SIZE, - Placement::Fixed(GIB + PAGE_SIZE), + MemoryRange::new(GIB + PAGE_SIZE..GIB + PAGE_SIZE + MIB), ); let error = builder.allocate().unwrap_err(); @@ -979,17 +845,10 @@ mod tests { #[test] fn mmio32_uses_top_down_placement_below_4_gib() { - let mut reserved = MemoryRange::EMPTY; let mut first = MemoryRange::EMPTY; let mut second = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); - builder.request( - "reserved", - &mut reserved, - 32 * MIB, - PAGE_SIZE, - Placement::Fixed(0xFE00_0000), - ); + builder.fixed("reserved", MemoryRange::new(0xFE00_0000..0x1_0000_0000)); builder.request("first", &mut first, MIB, MIB, Placement::Mmio32); builder.request("second", &mut second, MIB, MIB, Placement::Mmio32); @@ -1034,11 +893,10 @@ mod tests { #[test] fn ram_splits_around_fixed_ranges_and_mmio32() { - let mut fixed = MemoryRange::EMPTY; let mut mmio32 = MemoryRange::EMPTY; let mut ram = Vec::new(); let mut builder = LayoutBuilder::new(); - builder.request("fixed", &mut fixed, MIB, PAGE_SIZE, Placement::Fixed(GIB)); + builder.fixed("fixed", MemoryRange::new(GIB..GIB + MIB)); builder.request("mmio32", &mut mmio32, 2 * GIB, MIB, Placement::Mmio32); builder.ram("ram", &mut ram, 3 * GIB, PAGE_SIZE); @@ -1073,17 +931,10 @@ mod tests { #[test] fn mmio64_skips_fixed_ranges_above_ram() { let mut ram = Vec::new(); - let mut fixed = MemoryRange::EMPTY; let mut mmio64 = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); - builder.request( - "fixed", - &mut fixed, - MIB, - PAGE_SIZE, - Placement::Fixed(2 * GIB), - ); + builder.fixed("fixed", MemoryRange::new(2 * GIB..2 * GIB + MIB)); builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); 
builder.allocate().unwrap(); @@ -1172,23 +1023,13 @@ mod tests { #[test] fn fixed_hypertransport_hole_is_regular_fixed_placement() { let mut ram = Vec::new(); - let mut hypertransport = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); - builder.request( - "amd_hypertransport_hole", - &mut hypertransport, - GIB, - PAGE_SIZE, - Placement::Fixed(0xFD_0000_0000), - ); + let hypertransport = MemoryRange::new(0xFD_0000_0000..0xFD_4000_0000); + builder.fixed("amd_hypertransport_hole", hypertransport); let sorted = builder.allocate().unwrap(); - assert_eq!( - hypertransport, - MemoryRange::new(0xFD_0000_0000..0xFD_4000_0000) - ); assert_eq!(sorted.last().unwrap().range, hypertransport); } @@ -1212,15 +1053,8 @@ mod tests { )); let mut ram = Vec::new(); - let mut fixed = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); - builder.request( - "fixed", - &mut fixed, - ADDRESS_LIMIT, - PAGE_SIZE, - Placement::Fixed(0), - ); + builder.fixed("fixed", MemoryRange::new(0..ADDRESS_LIMIT)); builder.ram("ram", &mut ram, PAGE_SIZE, PAGE_SIZE); assert!(matches!( builder.allocate().unwrap_err(), @@ -1231,17 +1065,10 @@ mod tests { )); let mut ram = Vec::new(); - let mut fixed = MemoryRange::EMPTY; let mut mmio64 = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); builder.ram("ram", &mut ram, PAGE_SIZE, PAGE_SIZE); - builder.request( - "fixed", - &mut fixed, - ADDRESS_LIMIT - PAGE_SIZE, - PAGE_SIZE, - Placement::Fixed(PAGE_SIZE), - ); + builder.fixed("fixed", MemoryRange::new(PAGE_SIZE..ADDRESS_LIMIT)); builder.request( "mmio64", &mut mmio64, @@ -1284,20 +1111,13 @@ mod tests { for _ in 0..10 { let mut ram = Vec::new(); - let mut reserved = MemoryRange::EMPTY; let mut vmbus_low = MemoryRange::EMPTY; let mut pcie_ecam = MemoryRange::EMPTY; let mut pcie_high = MemoryRange::EMPTY; let mut virtio = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); - 
builder.request( - "reserved", - &mut reserved, - 32 * MIB, - PAGE_SIZE, - Placement::Fixed(0xFE00_0000), - ); + builder.fixed("reserved", MemoryRange::new(0xFE00_0000..0x1_0000_0000)); builder.request( "vmbus_low", &mut vmbus_low, From 40cb3e41d2985a680429b34d21a5c027d072a7b7 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 04:27:04 +0000 Subject: [PATCH 08/36] cleanup --- .../openvmm_core/src/worker/memory_layout.rs | 35 +++++++++++++++---- vm/vmcore/vm_topology/src/layout.rs | 31 ++++++++++++++-- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 265244993c..7685b9f151 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -11,7 +11,8 @@ use vm_topology::memory::MemoryLayout; use vm_topology::memory::MemoryRangeWithNode; const PAGE_SIZE: u64 = 4096; -const RAM_ALIGNMENT: u64 = 1024 * 1024 * 1024; +const TWO_MB: u64 = 2 * 1024 * 1024; +const GB: u64 = 1024 * 1024 * 1024; pub(super) struct MemoryLayoutInput<'a> { pub mem_size: u64, @@ -40,7 +41,8 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res .zip(&mut ram_ranges_by_node) .enumerate() { - builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, RAM_ALIGNMENT); + let ram_alignment = if ram_size < GB { TWO_MB } else { GB }; + builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, ram_alignment); } if let Some(vtl2_layout) = input.vtl2_layout { @@ -141,7 +143,6 @@ mod tests { use super::*; use vm_topology::memory::AddressType; - const GB: u64 = 1024 * 1024 * 1024; const MB: u64 = 1024 * 1024; fn input<'a>( @@ -223,7 +224,7 @@ mod tests { } #[test] - fn ram_chunks_start_on_gb_alignment() { + fn gb_sized_ram_request_uses_gb_chunks() { let mmio = [MemoryRange::new(GB + MB..GB + 2 * MB)]; let actual = resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], None)).unwrap(); @@ 
-232,16 +233,36 @@ mod tests { actual.ram(), &[ MemoryRangeWithNode { - range: MemoryRange::new(0..GB + MB), + range: MemoryRange::new(0..GB), vnode: 0, }, MemoryRangeWithNode { - range: MemoryRange::new(2 * GB..3 * GB - MB), + range: MemoryRange::new(2 * GB..3 * GB), vnode: 0, }, ] ); - assert!(actual.ram().iter().all(|ram| ram.range.start() % GB == 0)); + } + + #[test] + fn sub_gb_numa_nodes_use_two_mb_alignment() { + let sizes = [512 * MB, 512 * MB]; + + let actual = resolve_memory_layout(input(GB, Some(&sizes), &[], &[], &[], None)).unwrap(); + + assert_eq!( + actual.ram(), + &[ + MemoryRangeWithNode { + range: MemoryRange::new(0..512 * MB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(512 * MB..GB), + vnode: 1, + }, + ] + ); } #[test] diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index d8f5f08239..1c0a320daa 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -505,8 +505,11 @@ impl<'a> LayoutBuilder<'a> { /// Adds an ordinary RAM request to the builder. /// /// RAM is placed bottom up from GPA 0 and may split around fixed and MMIO32 - /// ranges. The target vector is replaced with the placed RAM extents when - /// [`Self::allocate`] succeeds. + /// ranges. Each extent starts at a multiple of `alignment`, and split extents that do not + /// satisfy the rest of the request are rounded down to `alignment` so large + /// aligned requests are not fragmented into smaller chunks. The target + /// vector is replaced with the placed RAM extents when [`Self::allocate`] + /// succeeds. 
pub fn ram( &mut self, tag: impl Into, @@ -749,7 +752,14 @@ fn find_lowest_splittable_fit( } let available = effective_end - aligned_start; - let allocation_size = available.min(remaining); + let allocation_size = if available >= remaining { + remaining + } else { + align_down(available, alignment) + }; + if allocation_size == 0 { + continue; + } ranges.push(MemoryRange::new( aligned_start..aligned_start + allocation_size, )); @@ -912,6 +922,21 @@ mod tests { ); } + #[test] + fn ram_split_chunks_round_down_to_alignment() { + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.fixed("fixed", MemoryRange::new(GIB + MIB..GIB + 2 * MIB)); + builder.ram("ram", &mut ram, 2 * GIB, GIB); + + builder.allocate().unwrap(); + + assert_eq!( + ram, + [MemoryRange::new(0..GIB), MemoryRange::new(2 * GIB..3 * GIB),] + ); + } + #[test] fn mmio64_uses_bottom_up_placement_from_end_of_ram() { let mut ram = Vec::new(); From 0f806770c2335a6ad3b018813c4ce777011a9777 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 04:48:40 +0000 Subject: [PATCH 09/36] docs --- Guide/src/SUMMARY.md | 1 + .../architecture/openvmm/memory-layout.md | 238 ++++++++++++++++++ .../openvmm_core/src/worker/memory_layout.rs | 63 ++++- 3 files changed, 293 insertions(+), 9 deletions(-) create mode 100644 Guide/src/reference/architecture/openvmm/memory-layout.md diff --git a/Guide/src/SUMMARY.md b/Guide/src/SUMMARY.md index b9533b7948..c0036e01b8 100644 --- a/Guide/src/SUMMARY.md +++ b/Guide/src/SUMMARY.md @@ -127,6 +127,7 @@ - [Consomme](./reference/backends/consomme.md) - [Architecture](./reference/architecture.md) - [OpenVMM Architecture](./reference/architecture/openvmm.md) + - [Memory Layout](./reference/architecture/openvmm/memory-layout.md) - [mesh](./reference/architecture/openvmm/mesh.md) - [Using mesh](./reference/architecture/openvmm/mesh/usage.md) - [How mesh works](./reference/architecture/openvmm/mesh/internals.md) diff --git 
a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md new file mode 100644 index 0000000000..644062ae34 --- /dev/null +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -0,0 +1,238 @@ +# Memory Layout + +OpenVMM computes guest physical address layouts by combining fixed platform +ranges, RAM requests, MMIO requests, and private implementation ranges through a +single deterministic allocator. + +The memory layout is part of the VM compatibility contract. Guest operating +systems remember RAM and device addresses across hibernation, and saved VM state +contains device state tied to those addresses. For an existing VM, changing +request order, placement class, or alignment policy can move guest physical +addresses and break resume. + +```admonish warning title="Compatibility surface" +Treat layout policy changes like VM ABI changes. A new default can be fine for +new VMs, but existing persisted VM configuration must continue to resolve to the +same guest physical addresses. +``` + +## Layers + +Memory layout is split across three layers: + +| Layer | Responsibility | +|---|---| +| `vm_topology::layout` | Pure address-space allocation. | +| `openvmm_core::worker::memory_layout` | Production VM policy and validation. | +| `vm_topology::memory::MemoryLayout` | Shared validation and query API. | + +[`vm_topology::layout::LayoutBuilder`](https://openvmm.dev/rustdoc/linux/vm_topology/layout/struct.LayoutBuilder.html) +knows only about ranges, sizes, alignments, and placement classes. It does not +know about chipsets, firmware, VTLs, PCI, or host physical address width. +Callers express policy by adding fixed ranges, reserved ranges, RAM requests, +and dynamic MMIO requests. + +The VM worker owns the production policy. It currently feeds existing +`mmio_gaps`, `pci_ecam_gaps`, and `pci_mmio_gaps` into the allocator as fixed +occupied ranges, then asks the allocator to place RAM. 
Future work moves more +MMIO consumers from precomputed gaps into typed dynamic requests. + +`MemoryLayout` remains the object other worker code uses to query RAM, MMIO, +PCI ECAM, PCI MMIO, VTL2 memory, and the VTL0-visible layout top. + +## Request Types + +The allocator accepts these input forms: + +| Input | Meaning | +|---|---| +| `reserve(tag, range)` | Blocks allocation, but does not raise layout top. | +| `fixed(tag, range)` | Already-known occupied range that is part of layout. | +| `ram(tag, target, size, alignment)` | Splittable ordinary RAM request. | +| `request(..., Placement::Mmio32)` | Single range below 4 GB, packed top down. | +| `request(..., Placement::Mmio64)` | Single range after RAM, packed bottom up. | +| `request(..., Placement::PostMmio)` | Single range after all VTL0 RAM and MMIO. | + +`reserve` is for architectural holes that must block allocation but should not +make the VTL0 layout appear larger. A high reserved hole near the top of the +address space, for example, should not force VTL2 or high MMIO above that hole. + +`fixed` is for ranges that have already been resolved by policy or existing +configuration. Fixed ranges block all dynamic allocation and are included in the +returned placed ranges. + +## Allocation Order + +The allocator is deterministic for the same request list. The phase order is: + +1. Remove reserved ranges from free space. +2. Remove fixed ranges from free space. +3. Allocate 32-bit MMIO below 4 GB, top down. +4. Allocate ordinary RAM from GPA 0 upward, splitting around holes. +5. Allocate 64-bit MMIO from the end of RAM upward. +6. Allocate post-MMIO ranges after the VTL0-visible layout. + +Within MMIO phases, requests are ordered by alignment, then size, then caller +order. RAM and post-MMIO requests use caller order because those orders carry +policy. RAM request order assigns NUMA vnode ownership. Post-MMIO request order +keeps private implementation ranges from being reordered by alignment. 
+ +## Worker Policy + +The VM worker resolver applies the production policy in +`openvmm/openvmm_core/src/worker/memory_layout.rs`: + +1. Validate total RAM size and optional per-vNUMA budgets. +2. Add existing MMIO, PCI ECAM, and PCI MMIO gaps as fixed ranges. +3. Add RAM requests in vnode order. +4. Add optional IGVM VTL2 memory as `Placement::PostMmio`. +5. Allocate all ranges. +6. Build `MemoryLayout` from the resolved RAM and fixed ranges. +7. Validate the VTL0-visible layout top against host physical address width. + +Host physical address width is deliberately not an allocator input. The layout +is computed from VM configuration first, then checked against the host. That +keeps guest physical addresses from changing just because the VM runs on a host +with a different physical address width. + +## RAM Alignment + +Worker RAM requests use two alignment policies: + +| RAM request size | Alignment | +|---|---| +| Less than 1 GB | 2 MB | +| At least 1 GB | 1 GB | + +The alignment is also the split granularity. If a RAM request cannot fit entirely in +the current free range, the allocator rounds the non-final chunk down to the +request alignment before continuing. That prevents a tiny fixed hole from +creating odd sub-GB RAM fragments in an otherwise GB-sized VM. + +Sub-GB RAM requests use 2 MB alignment so small NUMA nodes do not waste a full +GB of guest physical address space. + +## VTL2 Placement + +IGVM files can request VTL2 memory using `Vtl2BaseAddressType::MemoryLayout`. +The worker derives only a size and alignment from the IGVM file. It does not +feed IGVM relocation min/max bounds into layout. + +VTL2 memory is allocated as `Placement::PostMmio`, after all VTL0-visible RAM +and MMIO. Enabling VTL2 must not move VTL0 RAM or device ranges. The selected +VTL2 base is later validated by the IGVM loader against the file's relocation +records. Unsupported IGVM files fail there instead of reshaping the VTL0 layout. 
+ +## Examples + +The examples below use compact synthetic ranges. They describe the same policy +that the unit tests cover in `openvmm_core::worker::memory_layout` and +`vm_topology::layout`. + +### Fixed MMIO Splits RAM + +A VM with 4 GB of RAM and a fixed MMIO hole from 1 GB to 2 GB gets RAM on both +sides of the hole. + +| Input | Range | +|---|---| +| RAM request | 4 GB | +| Fixed MMIO | `0x4000_0000..0x8000_0000` | + +| Output | Range | +|---|---| +| RAM | `0x0000_0000..0x4000_0000` | +| MMIO | `0x4000_0000..0x8000_0000` | +| RAM | `0x8000_0000..0x1_4000_0000` | + +The total RAM is still 4 GB. The fixed range is occupied address space, not RAM. + +### GB RAM Chunks Stay GB-Sized + +A 2 GB RAM request with a small fixed hole just above 1 GB should not create a +nearly-1-GB chunk plus a tiny fragment. + +| Input | Range | +|---|---| +| RAM request | 2 GB, 1 GB alignment | +| Fixed MMIO | `0x4010_0000..0x4020_0000` | + +| Output | Range | +|---|---| +| RAM | `0x0000_0000..0x4000_0000` | +| Fixed MMIO | `0x4010_0000..0x4020_0000` | +| RAM | `0x8000_0000..0xC000_0000` | + +The allocator uses the first full 1 GB chunk, skips the interrupted region, and +continues at the next 1 GB boundary. + +### Small NUMA Nodes Use 2 MB Alignment + +For two 512 MB NUMA nodes, using 1 GB alignment would waste address space and +make the layout harder to read. The worker uses 2 MB alignment for sub-GB RAM +requests. + +| Input | Size | +|---|---| +| vnode 0 RAM | 512 MB | +| vnode 1 RAM | 512 MB | + +| Output | Range | +|---|---| +| vnode 0 RAM | `0x0000_0000..0x2000_0000` | +| vnode 1 RAM | `0x2000_0000..0x4000_0000` | + +The request order is the vnode assignment order, so changing it changes the NUMA +layout. + +### VTL2 Does Not Move VTL0 + +Start with 2 GB of VTL0 RAM and a fixed MMIO hole from 1 GB to 2 GB. 
+ +| VTL0 output | Range | +|---|---| +| RAM | `0x0000_0000..0x4000_0000` | +| MMIO | `0x4000_0000..0x8000_0000` | +| RAM | `0x8000_0000..0xC000_0000` | + +If the IGVM file asks for 2 MB of VTL2 memory, the VTL0 layout stays exactly the +same. VTL2 is placed separately after the VTL0-visible top. + +| Private output | Range | +|---|---| +| VTL2 | `0xC000_0000..0xC020_0000` | + +`MemoryLayout::end_of_layout()` reports the VTL0-visible top. VTL2 remains +available through `MemoryLayout::vtl2_range()`. + +### Reserved High Holes Do Not Raise Layout Top + +A reserved range blocks allocation, but it does not describe a guest-visible +resource. If a VM has 2 GB of RAM and a high reserved hole, post-MMIO memory can +still start immediately after the VTL0 layout. + +| Input | Range | +|---|---| +| RAM request | 2 GB | +| Reserved hole | `0xFD_0000_0000..0xFD_4000_0000` | +| Post-MMIO request | 1 MB | + +| Output | Range | +|---|---| +| RAM | `0x0000_0000..0x8000_0000` | +| Post-MMIO | `0x8000_0000..0x8010_0000` | + +The reserved hole is not returned at the end of the sorted layout because it is +only a constraint. If a reserved range sits between returned allocations, it is +reported so callers can inspect the occupied map. + +## Where To Update This Page + +Update this page when changing any of these behaviors: + +- placement phase order in `vm_topology::layout` +- `reserve`, `fixed`, `ram`, or `request` semantics +- worker RAM alignment policy +- VTL2 `MemoryLayout` placement +- host physical-address validation policy +- `MemoryLayout::end_of_layout()` or `MemoryLayout::vtl2_range()` semantics diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 7685b9f151..48733b05d4 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -1,6 +1,20 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +//! 
Guest physical memory layout resolution for the VM worker. +//! +//! This module is the point where OpenVMM turns stable VM configuration and +//! already-known platform ranges into the production [`MemoryLayout`]. The +//! resulting guest physical addresses are part of the VM's compatibility surface: +//! hibernated guests and saved VMs remember device and RAM locations, so changes +//! to the request order, placement class, or alignment policy can break resume or +//! restore. Keep layout policy changes deliberate and covered by tests. +//! +//! The resolver keeps today's MMIO inputs fixed while moving RAM and VTL2 +//! placement into `vm_topology::layout`. Fixed ranges are registered first so RAM +//! splits around them. VTL2 is registered last as post-MMIO private memory so it +//! does not perturb the VTL0-visible RAM/MMIO layout. + use super::vm_loaders::igvm::Vtl2MemoryLayoutRequest; use anyhow::Context; use anyhow::bail; @@ -15,12 +29,24 @@ const TWO_MB: u64 = 2 * 1024 * 1024; const GB: u64 = 1024 * 1024 * 1024; pub(super) struct MemoryLayoutInput<'a> { + /// Total VTL0 RAM size requested by the VM configuration. pub mem_size: u64, + /// Optional per-vNUMA RAM budgets. When present, these must sum to + /// `mem_size`, and request order is the vnode assignment order. pub numa_mem_sizes: Option<&'a [u64]>, + /// Existing resolved chipset/MMIO ranges. These are fixed for this + /// transition step; later commits will move individual consumers to typed + /// dynamic intents. pub mmio_gaps: &'a [MemoryRange], + /// Existing resolved PCI ECAM ranges, treated as fixed occupied space. pub pci_ecam_gaps: &'a [MemoryRange], + /// Existing resolved PCI MMIO ranges, treated as fixed occupied space. pub pci_mmio_gaps: &'a [MemoryRange], + /// Optional IGVM VTL2 private-memory request. This is allocated after all + /// VTL0-visible RAM and MMIO and is carried separately from ordinary RAM. 
pub vtl2_layout: Option, + /// Host-supported physical address width used only after allocation. The + /// allocator computes the smallest layout it can; host fit is validation. pub physical_address_size: u8, } @@ -35,6 +61,11 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res add_fixed_ranges(&mut builder, "pci_ecam", input.pci_ecam_gaps); add_fixed_ranges(&mut builder, "pci_mmio", input.pci_mmio_gaps); + // RAM request order is part of the NUMA compatibility contract: the first + // request maps to vnode 0, the second to vnode 1, and so on. For GB-sized + // nodes, use GB alignment so holes do not create sub-GB RAM chunks. For + // sub-GB nodes, use 2 MB alignment to avoid wasting a full GB of address + // space per small node. for (vnode, (ram_size, ram_ranges)) in ram_sizes .iter() .copied() @@ -45,6 +76,15 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, ram_alignment); } + // VTL2 MemoryLayout mode is implementation-private memory, not a VTL0 RAM + // hole. Allocate it only after all VTL0-visible RAM/MMIO so enabling VTL2 + // does not move the VTL0 layout. + // + // IGVM relocation min/max constraints are checked later by the IGVM loader + // against the selected base; using them as a constraint here would be + // overconstraining and would lead to holes in the VTL0 layout--we just + // don't support IGVM files with relocation sections that cannot be + // satisfied by the post-MMIO space. if let Some(vtl2_layout) = input.vtl2_layout { builder.request( "vtl2", @@ -72,6 +112,9 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res let vtl2_range = input.vtl2_layout.map(|_| vtl2_range); + // `MemoryLayout` remains the shared validation and query type for the rest + // of the worker. Construct it from resolved RAM so no later consumer repeats + // RAM placement or infers RAM by subtracting from MMIO gaps. 
let memory_layout = MemoryLayout::new_from_resolved_ranges( ram, input.mmio_gaps.to_vec(), @@ -81,7 +124,11 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res ) .context("validating resolved memory layout")?; - let address_space_limit = physical_address_limit(input.physical_address_size); + // Host address-width validation is intentionally after allocation. The + // layout engine is host-width independent, which keeps the layout a pure + // function of VM configuration and avoids host differences changing guest + // physical addresses. + let address_space_limit = 1u64 << input.physical_address_size; if memory_layout.end_of_layout() > address_space_limit { bail!( "memory layout ends at {:#x}, which exceeds the address width of {} bits", @@ -94,12 +141,18 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res } fn add_fixed_ranges(builder: &mut LayoutBuilder<'_>, tag_prefix: &str, ranges: &[MemoryRange]) { + // These are fixed only from the allocator's point of view. Today they are + // already-resolved config fields; future commits will replace some of them + // with typed dynamic requests owned by this resolver. for (index, range) in ranges.iter().enumerate() { builder.fixed(format!("{tag_prefix}[{index}]"), *range); } } fn validate_ram_sizes(mem_size: u64, numa_mem_sizes: Option<&[u64]>) -> anyhow::Result> { + // Keep validation compatible with `MemoryLayout::new()` / `new_with_numa()`: + // RAM sizes are page-granular, nonzero, and NUMA budgets must exactly cover + // the configured total. 
if mem_size == 0 || !mem_size.is_multiple_of(PAGE_SIZE) { bail!("invalid memory size {mem_size:#x}"); } @@ -130,14 +183,6 @@ fn validate_ram_sizes(mem_size: u64, numa_mem_sizes: Option<&[u64]>) -> anyhow:: Ok(numa_mem_sizes.to_vec()) } -fn physical_address_limit(physical_address_size: u8) -> u64 { - if physical_address_size >= u64::BITS as u8 { - u64::MAX - } else { - 1u64 << physical_address_size - } -} - #[cfg(test)] mod tests { use super::*; From 9e5c4882293a5fe862bc8571a5fbc245575ba450 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 05:53:56 +0000 Subject: [PATCH 10/36] pci --- .../architecture/openvmm/memory-layout.md | 27 +- openvmm/openvmm_core/src/worker/dispatch.rs | 25 +- .../openvmm_core/src/worker/memory_layout.rs | 310 +++++++++++++----- openvmm/openvmm_defs/src/config.rs | 14 +- openvmm/openvmm_entry/src/lib.rs | 43 +-- openvmm/openvmm_entry/src/ttrpc/mod.rs | 2 - petri/src/vm/openvmm/construct.rs | 2 - petri/src/vm/openvmm/modify.rs | 41 +-- 8 files changed, 289 insertions(+), 175 deletions(-) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index 644062ae34..baf4503405 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -32,10 +32,12 @@ know about chipsets, firmware, VTLs, PCI, or host physical address width. Callers express policy by adding fixed ranges, reserved ranges, RAM requests, and dynamic MMIO requests. -The VM worker owns the production policy. It currently feeds existing -`mmio_gaps`, `pci_ecam_gaps`, and `pci_mmio_gaps` into the allocator as fixed -occupied ranges, then asks the allocator to place RAM. Future work moves more -MMIO consumers from precomputed gaps into typed dynamic requests. +The VM worker owns the production policy. 
It feeds existing chipset MMIO gaps +into the allocator as fixed occupied ranges, resolves PCIe root complex ECAM +from an optional fixed range or the root-complex bus window, resolves PCIe low +MMIO and high MMIO from typed intents, then asks the allocator to place RAM and +private implementation ranges. Future work moves more MMIO consumers from +precomputed gaps into typed dynamic requests. `MemoryLayout` remains the object other worker code uses to query RAM, MMIO, PCI ECAM, PCI MMIO, VTL2 memory, and the VTL0-visible layout top. @@ -83,12 +85,17 @@ The VM worker resolver applies the production policy in `openvmm/openvmm_core/src/worker/memory_layout.rs`: 1. Validate total RAM size and optional per-vNUMA budgets. -2. Add existing MMIO, PCI ECAM, and PCI MMIO gaps as fixed ranges. -3. Add RAM requests in vnode order. -4. Add optional IGVM VTL2 memory as `Placement::PostMmio`. -5. Allocate all ranges. -6. Build `MemoryLayout` from the resolved RAM and fixed ranges. -7. Validate the VTL0-visible layout top against host physical address width. +2. Add existing chipset MMIO gaps as fixed ranges. +3. Add PCIe root complex ECAM and low MMIO requests as `Placement::Mmio32`. + A root complex with no fixed ECAM range gets an ECAM size derived from its + bus window. +4. Add PCIe root complex high MMIO requests as `Placement::Mmio64`. +5. Add RAM requests in vnode order. +6. Add optional IGVM VTL2 memory as `Placement::PostMmio`. +7. Allocate all ranges. +8. Build `MemoryLayout` from resolved RAM, chipset MMIO gaps, and resolved PCIe + ranges. +9. Validate the VTL0-visible layout top against host physical address width. Host physical address width is deliberately not an allocator input. The layout is computed from VM configuration first, then checked against the host. 
That diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 9d680cf8ed..1ab093c424 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -6,6 +6,7 @@ use crate::partition::BindHvliteVp; use crate::partition::HvlitePartition; use crate::vmgs_non_volatile_store::HvLiteVmgsNonVolatileStore; use crate::worker::memory_layout::MemoryLayoutInput; +use crate::worker::memory_layout::ResolvedPcieRootComplexRanges; use crate::worker::memory_layout::resolve_memory_layout; use crate::worker::rom::RomBuilder; use acpi::dsdt; @@ -399,6 +400,7 @@ pub(crate) struct InitializedVm { gm: GuestMemory, cfg: Manifest, mem_layout: MemoryLayout, + resolved_pcie_root_complex_ranges: Vec, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, @@ -899,16 +901,17 @@ impl InitializedVm { // TODO: The vNUMA nodes reported are meant for test usage only, as they // are not aligned to any physical NUMA node. There is more work to do // to support useful vNUMA reporting. - let mem_layout = resolve_memory_layout(MemoryLayoutInput { + let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), mmio_gaps: &cfg.memory.mmio_gaps, - pci_ecam_gaps: &cfg.memory.pci_ecam_gaps, - pci_mmio_gaps: &cfg.memory.pci_mmio_gaps, + pcie_root_complexes: &cfg.pcie_root_complexes, vtl2_layout, physical_address_size, }) .context("invalid memory configuration")?; + let mem_layout = resolved_layout.memory_layout; + let resolved_pcie_root_complex_ranges = resolved_layout.pcie_root_complex_ranges; // Place the alias map at the end of the address space. 
Newer versions // of OpenHCL support receiving this offset via devicetree (especially @@ -1020,6 +1023,7 @@ impl InitializedVm { gm, cfg, mem_layout, + resolved_pcie_root_complex_ranges, processor_topology, igvm_file, driver_source, @@ -1046,6 +1050,7 @@ impl InitializedVm { gm, cfg, mem_layout, + resolved_pcie_root_complex_ranges, processor_topology, igvm_file, driver_source, @@ -1753,7 +1758,11 @@ impl InitializedVm { let mut pcie_host_bridges = Vec::new(); let mut pcie_root_complexes = Vec::new(); - for rc in cfg.pcie_root_complexes { + for (rc, ranges) in cfg + .pcie_root_complexes + .into_iter() + .zip(resolved_pcie_root_complex_ranges) + { let device_name = format!("pcie-root:{}", rc.name); // Create a static bus range for the root complex so that @@ -1785,7 +1794,7 @@ impl InitializedVm { &mut services.register_mmio(), rc.start_bus, rc.end_bus, - rc.ecam_range, + ranges.ecam_range, root_port_definitions, msi_conn.target(), ) @@ -1810,9 +1819,9 @@ impl InitializedVm { segment: rc.segment, start_bus: rc.start_bus, end_bus: rc.end_bus, - ecam_range: rc.ecam_range, - low_mmio: rc.low_mmio, - high_mmio: rc.high_mmio, + ecam_range: ranges.ecam_range, + low_mmio: ranges.low_mmio, + high_mmio: ranges.high_mmio, }); pcie_root_complexes.push(root_complex.clone()); diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 48733b05d4..8eb01e87de 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -19,6 +19,8 @@ use super::vm_loaders::igvm::Vtl2MemoryLayoutRequest; use anyhow::Context; use anyhow::bail; use memory_range::MemoryRange; +use openvmm_defs::config::MmioRangeConfig; +use openvmm_defs::config::PcieRootComplexConfig; use vm_topology::layout::LayoutBuilder; use vm_topology::layout::Placement; use vm_topology::memory::MemoryLayout; @@ -28,6 +30,22 @@ const PAGE_SIZE: u64 = 4096; const TWO_MB: u64 = 2 * 1024 * 1024; const GB: u64 = 1024 * 
1024 * 1024; +/// PCIe ECAM: 32 devices * 8 functions * 4 KiB config space = 1 MB per bus. +const PCIE_ECAM_BYTES_PER_BUS: u64 = 32 * 8 * 4096; + +#[derive(Debug)] +pub(super) struct ResolvedMemoryLayout { + pub memory_layout: MemoryLayout, + pub pcie_root_complex_ranges: Vec<ResolvedPcieRootComplexRanges>, +} + +#[derive(Debug)] +pub(super) struct ResolvedPcieRootComplexRanges { + pub ecam_range: MemoryRange, + pub low_mmio: MemoryRange, + pub high_mmio: MemoryRange, +} + pub(super) struct MemoryLayoutInput<'a> { /// Total VTL0 RAM size requested by the VM configuration. pub mem_size: u64, @@ -38,10 +56,9 @@ pub(super) struct MemoryLayoutInput<'a> { /// transition step; later commits will move individual consumers to typed /// dynamic intents. pub mmio_gaps: &'a [MemoryRange], - /// Existing resolved PCI ECAM ranges, treated as fixed occupied space. - pub pci_ecam_gaps: &'a [MemoryRange], - /// Existing resolved PCI MMIO ranges, treated as fixed occupied space. - pub pci_mmio_gaps: &'a [MemoryRange], + /// PCIe root complex address-space intents. These are resolved by this + /// worker step so front ends do not need to carve guest physical addresses. + pub pcie_root_complexes: &'a [PcieRootComplexConfig], /// Optional IGVM VTL2 private-memory request. This is allocated after all /// VTL0-visible RAM and MMIO and is carried separately from ordinary RAM.
pub vtl2_layout: Option<Vtl2MemoryLayoutRequest>, @@ -50,16 +67,66 @@ pub(super) struct MemoryLayoutInput<'a> { pub physical_address_size: u8, } -pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Result<MemoryLayout> { +pub(super) fn resolve_memory_layout( + input: MemoryLayoutInput<'_>, +) -> anyhow::Result<ResolvedMemoryLayout> { let ram_sizes = validate_ram_sizes(input.mem_size, input.numa_mem_sizes)?; let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; + let mut pcie_root_complex_ranges = input + .pcie_root_complexes + .iter() + .map(|_| ResolvedPcieRootComplexRanges { + ecam_range: MemoryRange::EMPTY, + low_mmio: MemoryRange::EMPTY, + high_mmio: MemoryRange::EMPTY, + }) + .collect::<Vec<_>>(); let mut vtl2_range = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); add_fixed_ranges(&mut builder, "mmio", input.mmio_gaps); - add_fixed_ranges(&mut builder, "pci_ecam", input.pci_ecam_gaps); - add_fixed_ranges(&mut builder, "pci_mmio", input.pci_mmio_gaps); + + for (index, (root_complex, ranges)) in input + .pcie_root_complexes + .iter() + .zip(&mut pcie_root_complex_ranges) + .enumerate() + { + add_pcie_ecam_range( + &mut builder, + format!("pcie[{index}].ecam"), + &mut ranges.ecam_range, + &root_complex.ecam_range, + pcie_ecam_size(root_complex)?, + PCIE_ECAM_BYTES_PER_BUS, + ); + // Low MMIO: 2 MB aligned. + add_pcie_mmio_range( + &mut builder, + format!("pcie[{index}].low_mmio"), + &mut ranges.low_mmio, + &root_complex.low_mmio, + TWO_MB, + Placement::Mmio32, + ); + // High MMIO: 1 GB aligned. Ideally we'd align it to its actual size so + // that the full amount is always usable for a single large BAR. But + // that burns physical address space, which is especially limited on + // some x86 machines. + // + // The downside of this approach is that the maximum mappable BAR size + // is a function of the rest of the topology, which can create + // reliability issues for users.
+ add_pcie_mmio_range( + &mut builder, + format!("pcie[{index}].high_mmio"), + &mut ranges.high_mmio, + &root_complex.high_mmio, + GB, + Placement::Mmio64, + ); + } // RAM request order is part of the NUMA compatibility contract: the first // request maps to vnode 0, the second to vnode 1, and so on. For GB-sized @@ -115,11 +182,27 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res // `MemoryLayout` remains the shared validation and query type for the rest // of the worker. Construct it from resolved RAM so no later consumer repeats // RAM placement or infers RAM by subtracting from MMIO gaps. + let mut pci_ecam_gaps: Vec<MemoryRange> = Vec::new(); + pci_ecam_gaps.extend( + pcie_root_complex_ranges + .iter() + .map(|ranges| ranges.ecam_range), + ); + pci_ecam_gaps.sort(); + + let mut pci_mmio_gaps: Vec<MemoryRange> = Vec::new(); + pci_mmio_gaps.extend( + pcie_root_complex_ranges + .iter() + .flat_map(|ranges| [ranges.low_mmio, ranges.high_mmio]), + ); + pci_mmio_gaps.sort(); + let memory_layout = MemoryLayout::new_from_resolved_ranges( ram, input.mmio_gaps.to_vec(), - input.pci_ecam_gaps.to_vec(), - input.pci_mmio_gaps.to_vec(), + pci_ecam_gaps, + pci_mmio_gaps, vtl2_range, ) .context("validating resolved memory layout")?; @@ -137,7 +220,60 @@ pub(super) fn resolve_memory_layout(input: MemoryLayoutInput<'_>) -> anyhow::Res ); } - Ok(memory_layout) + Ok(ResolvedMemoryLayout { + memory_layout, + pcie_root_complex_ranges, + }) +} + +fn pcie_ecam_size(root_complex: &PcieRootComplexConfig) -> anyhow::Result<u64> { + let bus_count = root_complex + .end_bus + .checked_sub(root_complex.start_bus) + .with_context(|| { + format!( + "invalid PCIe bus range {}..{} for {}", + root_complex.start_bus, root_complex.end_bus, root_complex.name + ) + })?; + + Ok((u64::from(bus_count) + 1) * PCIE_ECAM_BYTES_PER_BUS) +} + +fn add_pcie_ecam_range<'a>( + builder: &mut LayoutBuilder<'a>, + tag: String, + target: &'a mut MemoryRange, + config: &Option<MemoryRange>, + size: u64, + alignment: u64, +) {
+ match config { + Some(range) => { + *target = *range; + builder.fixed(tag, *range); + } + None => builder.request(tag, target, size, alignment, Placement::Mmio32), + } +} + +fn add_pcie_mmio_range<'a>( + builder: &mut LayoutBuilder<'a>, + tag: String, + target: &'a mut MemoryRange, + config: &MmioRangeConfig, + alignment: u64, + placement: Placement, +) { + match config { + MmioRangeConfig::Dynamic { size } => { + builder.request(tag, target, *size, alignment, placement); + } + MmioRangeConfig::Fixed(range) => { + *target = *range; + builder.fixed(tag, *range); + } + } } fn add_fixed_ranges(builder: &mut LayoutBuilder<'_>, tag_prefix: &str, ranges: &[MemoryRange]) { @@ -194,21 +330,22 @@ mod tests { mem_size: u64, numa_mem_sizes: Option<&'a [u64]>, mmio_gaps: &'a [MemoryRange], - pci_ecam_gaps: &'a [MemoryRange], - pci_mmio_gaps: &'a [MemoryRange], vtl2_layout: Option, ) -> MemoryLayoutInput<'a> { MemoryLayoutInput { mem_size, numa_mem_sizes, mmio_gaps, - pci_ecam_gaps, - pci_mmio_gaps, + pcie_root_complexes: &[], vtl2_layout, physical_address_size: 46, } } + fn resolve(input: MemoryLayoutInput<'_>) -> MemoryLayout { + resolve_memory_layout(input).unwrap().memory_layout + } + fn vtl2_layout(size: u64) -> Vtl2MemoryLayoutRequest { Vtl2MemoryLayoutRequest { size, @@ -216,18 +353,33 @@ mod tests { } } + fn pcie_root_complex( + ecam_range: Option, + low_mmio: MmioRangeConfig, + high_mmio: MmioRangeConfig, + ) -> PcieRootComplexConfig { + PcieRootComplexConfig { + index: 0, + name: "rc0".to_string(), + segment: 0, + start_bus: 0, + end_bus: 0, + ecam_range, + low_mmio, + high_mmio, + ports: Vec::new(), + } + } + #[test] fn non_numa_matches_memory_layout_new() { let mmio = [ MemoryRange::new(2 * GB..3 * GB), MemoryRange::new(4 * GB..5 * GB), ]; - let pci_ecam = [MemoryRange::new(8 * GB..9 * GB)]; - let pci_mmio = [MemoryRange::new(6 * GB..7 * GB)]; - let actual = - resolve_memory_layout(input(6 * GB, None, &mmio, &pci_ecam, &pci_mmio, None)).unwrap(); - let expected = 
MemoryLayout::new(6 * GB, &mmio, &pci_ecam, &pci_mmio, None).unwrap(); + let actual = resolve(input(6 * GB, None, &mmio, None)); + let expected = MemoryLayout::new(6 * GB, &mmio, &[], &[], None).unwrap(); assert_eq!(actual.ram(), expected.ram()); assert_eq!(actual.mmio(), expected.mmio()); @@ -241,8 +393,7 @@ mod tests { let mmio = [MemoryRange::new(3 * GB..4 * GB)]; let sizes = [2 * GB, 2 * GB]; - let actual = - resolve_memory_layout(input(4 * GB, Some(&sizes), &mmio, &[], &[], None)).unwrap(); + let actual = resolve(input(4 * GB, Some(&sizes), &mmio, None)); let expected = MemoryLayout::new_with_numa(&sizes, &mmio, &[], &[], None).unwrap(); assert_eq!(actual.ram(), expected.ram()); @@ -250,29 +401,62 @@ mod tests { #[test] fn fixed_ranges_are_occupied_for_ram() { - let mmio = [MemoryRange::new(GB..2 * GB)]; - let pci_ecam = [MemoryRange::new(3 * GB..3 * GB + MB)]; - let pci_mmio = [MemoryRange::new(4 * GB..5 * GB)]; + let mmio = [ + MemoryRange::new(GB..2 * GB), + MemoryRange::new(3 * GB..3 * GB + MB), + MemoryRange::new(4 * GB..5 * GB), + ]; - let actual = - resolve_memory_layout(input(4 * GB, None, &mmio, &pci_ecam, &pci_mmio, None)).unwrap(); + let actual = resolve(input(4 * GB, None, &mmio, None)); - assert_eq!(actual.probe_address(GB), Some(AddressType::Mmio)); - assert_eq!(actual.probe_address(3 * GB), Some(AddressType::PciEcam)); - assert_eq!(actual.probe_address(4 * GB), Some(AddressType::PciMmio)); assert_eq!(actual.ram_size(), 4 * GB); - assert!(actual.ram().iter().all(|ram| { - !ram.range.overlaps(&mmio[0]) - && !ram.range.overlaps(&pci_ecam[0]) - && !ram.range.overlaps(&pci_mmio[0]) - })); + assert!( + actual + .ram() + .iter() + .all(|ram| { mmio.iter().all(|m| !ram.range.overlaps(m)) }) + ); + } + + #[test] + fn pcie_dynamic_intents_are_resolved() { + let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; + let root_complexes = [pcie_root_complex( + None, + MmioRangeConfig::Dynamic { size: 64 * MB }, + MmioRangeConfig::Dynamic { size: GB }, + )]; + let 
mut config = input(2 * GB, None, &mmio, None); + config.pcie_root_complexes = &root_complexes; + + let actual = resolve_memory_layout(config).unwrap(); + let ranges = &actual.pcie_root_complex_ranges[0]; + + assert_eq!( + ranges.ecam_range, + MemoryRange::new(0xf3f0_0000..0xf400_0000) + ); + assert_eq!(ranges.low_mmio, MemoryRange::new(0xf400_0000..0xf800_0000)); + assert_eq!(ranges.high_mmio, MemoryRange::new(2 * GB..3 * GB)); + assert_eq!( + actual.memory_layout.probe_address(0xf3f0_0000), + Some(AddressType::PciEcam) + ); + assert_eq!( + actual.memory_layout.probe_address(0xf400_0000), + Some(AddressType::PciMmio) + ); + assert_eq!( + actual.memory_layout.probe_address(2 * GB), + Some(AddressType::PciMmio) + ); } #[test] fn gb_sized_ram_request_uses_gb_chunks() { let mmio = [MemoryRange::new(GB + MB..GB + 2 * MB)]; - let actual = resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], None)).unwrap(); + let actual = resolve(input(2 * GB, None, &mmio, None)); assert_eq!( actual.ram(), @@ -293,7 +477,7 @@ mod tests { fn sub_gb_numa_nodes_use_two_mb_alignment() { let sizes = [512 * MB, 512 * MB]; - let actual = resolve_memory_layout(input(GB, Some(&sizes), &[], &[], &[], None)).unwrap(); + let actual = resolve(input(GB, Some(&sizes), &[], None)); assert_eq!( actual.ram(), @@ -312,19 +496,12 @@ mod tests { #[test] fn vtl2_is_allocated_after_all_mmio() { - let mmio = [MemoryRange::new(GB..2 * GB)]; - let pci_ecam = [MemoryRange::new(3 * GB..3 * GB + MB)]; - let pci_mmio = [MemoryRange::new(7 * GB..8 * GB)]; + let mmio = [ + MemoryRange::new(GB..2 * GB), + MemoryRange::new(7 * GB..8 * GB), + ]; - let actual = resolve_memory_layout(input( - 4 * GB, - None, - &mmio, - &pci_ecam, - &pci_mmio, - Some(vtl2_layout(2 * MB)), - )) - .unwrap(); + let actual = resolve(input(4 * GB, None, &mmio, Some(vtl2_layout(2 * MB)))); assert_eq!(actual.end_of_layout(), 8 * GB); assert_eq!( @@ -337,17 +514,8 @@ mod tests { fn vtl2_does_not_change_ram_placement() { let mmio = 
[MemoryRange::new(GB..2 * GB)]; - let without_vtl2 = - resolve_memory_layout(input(2 * GB, None, &mmio, &[], &[], None)).unwrap(); - let with_vtl2 = resolve_memory_layout(input( - 2 * GB, - None, - &mmio, - &[], - &[], - Some(vtl2_layout(2 * MB)), - )) - .unwrap(); + let without_vtl2 = resolve(input(2 * GB, None, &mmio, None)); + let with_vtl2 = resolve(input(2 * GB, None, &mmio, Some(vtl2_layout(2 * MB)))); assert_eq!(with_vtl2.ram(), without_vtl2.ram()); assert_eq!(with_vtl2.end_of_layout(), without_vtl2.end_of_layout()); @@ -363,28 +531,10 @@ mod tests { MemoryRange::new(GB..2 * GB), MemoryRange::new(5 * GB..6 * GB), ]; - let pci_ecam = [MemoryRange::new(3 * GB..3 * GB + MB)]; - let pci_mmio = [MemoryRange::new(7 * GB..8 * GB)]; let sizes = [2 * GB, 3 * GB]; - let first = resolve_memory_layout(input( - 5 * GB, - Some(&sizes), - &mmio, - &pci_ecam, - &pci_mmio, - None, - )) - .unwrap(); - let second = resolve_memory_layout(input( - 5 * GB, - Some(&sizes), - &mmio, - &pci_ecam, - &pci_mmio, - None, - )) - .unwrap(); + let first = resolve(input(5 * GB, Some(&sizes), &mmio, None)); + let second = resolve(input(5 * GB, Some(&sizes), &mmio, None)); assert_eq!(first.ram(), second.ram()); assert_eq!(first.end_of_layout(), second.end_of_layout()); @@ -393,7 +543,7 @@ mod tests { #[test] fn host_width_validation_happens_after_allocation() { let mmio = [MemoryRange::new(GB..4 * GB)]; - let mut config = input(3 * GB, None, &mmio, &[], &[], None); + let mut config = input(3 * GB, None, &mmio, None); config.physical_address_size = 32; let err = resolve_memory_layout(config).unwrap_err(); diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 07b4a7c38a..810f62d18b 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -216,6 +216,12 @@ pub enum Vtl2BaseAddressType { Vtl2Allocate { size: Option }, } +#[derive(Debug, MeshPayload)] +pub enum MmioRangeConfig { + Dynamic { size: u64 }, + Fixed(MemoryRange), 
+} + #[derive(Debug, MeshPayload)] pub struct PcieRootComplexConfig { pub index: u32, @@ -223,9 +229,9 @@ pub struct PcieRootComplexConfig { pub segment: u16, pub start_bus: u8, pub end_bus: u8, - pub ecam_range: MemoryRange, - pub low_mmio: MemoryRange, - pub high_mmio: MemoryRange, + pub ecam_range: Option, + pub low_mmio: MmioRangeConfig, + pub high_mmio: MmioRangeConfig, pub ports: Vec, } @@ -361,8 +367,6 @@ pub struct MemoryConfig { pub hugepages: bool, pub hugepage_size: Option, pub mmio_gaps: Vec, - pub pci_ecam_gaps: Vec, - pub pci_mmio_gaps: Vec, /// Test only: per-NUMA-node memory sizes. When set, RAM is distributed /// across vNUMA nodes according to these sizes instead of assigning all RAM /// to node 0. The sum must equal `mem_size`. diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index a99441fc0a..548270aefc 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -77,6 +77,7 @@ use openvmm_defs::config::HypervisorConfig; use openvmm_defs::config::LateMapVtl0MemoryPolicy; use openvmm_defs::config::LoadMode; use openvmm_defs::config::MemoryConfig; +use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieDeviceConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; @@ -739,12 +740,6 @@ async fn vm_config_from_command_line( (false, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64.into(), }; - let mut pci_ecam_gaps = Vec::new(); - let mut pci_mmio_gaps = Vec::new(); - - let mut low_mmio_start = mmio_gaps.first().context("expected mmio gap")?.start(); - let mut high_mmio_end = mmio_gaps.last().context("expected second mmio gap")?.end(); - let mut pcie_root_complexes = Vec::new(); for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() { let ports = opt @@ -764,43 +759,23 @@ async fn vm_config_from_command_line( .high_mmio .checked_next_multiple_of(ONE_MB) .context("high mmio rounding error")?; - let ecam_size = 
(((rc_cli.end_bus - rc_cli.start_bus) as u64) + 1) * 256 * 4096; - - let low_pci_mmio_start = low_mmio_start - .checked_sub(low_mmio_size) - .context("pci low mmio underflow")?; - let ecam_start = low_pci_mmio_start - .checked_sub(ecam_size) - .context("pci ecam underflow")?; - low_mmio_start = ecam_start; - high_mmio_end = high_mmio_end - .checked_add(high_mmio_size) - .context("pci high mmio overflow")?; - - let ecam_range = MemoryRange::new(ecam_start..ecam_start + ecam_size); - let low_mmio = MemoryRange::new(low_pci_mmio_start..low_pci_mmio_start + low_mmio_size); - let high_mmio = MemoryRange::new(high_mmio_end - high_mmio_size..high_mmio_end); - - pci_ecam_gaps.push(ecam_range); - pci_mmio_gaps.push(low_mmio); - pci_mmio_gaps.push(high_mmio); - pcie_root_complexes.push(PcieRootComplexConfig { index: i as u32, name: rc_cli.name.clone(), segment: rc_cli.segment, start_bus: rc_cli.start_bus, end_bus: rc_cli.end_bus, - ecam_range, - low_mmio, - high_mmio, + ecam_range: None, + low_mmio: MmioRangeConfig::Dynamic { + size: low_mmio_size, + }, + high_mmio: MmioRangeConfig::Dynamic { + size: high_mmio_size, + }, ports, }); } - pci_ecam_gaps.sort(); - pci_mmio_gaps.sort(); - let pcie_switches = build_switch_list(&opt.pcie_switch); #[cfg(target_os = "linux")] @@ -1628,8 +1603,6 @@ async fn vm_config_from_command_line( transparent_hugepages: opt.transparent_hugepages(), hugepages: opt.memory.hugepages, hugepage_size: opt.memory.hugepage_size, - pci_ecam_gaps, - pci_mmio_gaps, numa_mem_sizes: opt.numa_memory.clone(), }, processor_topology: ProcessorTopologyConfig { diff --git a/openvmm/openvmm_entry/src/ttrpc/mod.rs b/openvmm/openvmm_entry/src/ttrpc/mod.rs index 385d01bdf2..dcdffb82e1 100644 --- a/openvmm/openvmm_entry/src/ttrpc/mod.rs +++ b/openvmm/openvmm_entry/src/ttrpc/mod.rs @@ -574,8 +574,6 @@ impl VmService { memory: MemoryConfig { mem_size: config_mem_size, mmio_gaps: DEFAULT_MMIO_GAPS_X86.into(), - pci_ecam_gaps: vec![], - pci_mmio_gaps: vec![], 
prefetch_memory: false, private_memory: false, transparent_hugepages: false, diff --git a/petri/src/vm/openvmm/construct.rs b/petri/src/vm/openvmm/construct.rs index f6df910cc8..1e39f7b850 100644 --- a/petri/src/vm/openvmm/construct.rs +++ b/petri/src/vm/openvmm/construct.rs @@ -396,8 +396,6 @@ impl PetriVmConfigOpenVmm { } MmioConfig::Custom(ranges) => ranges, }, - pci_ecam_gaps: vec![], - pci_mmio_gaps: vec![], prefetch_memory: false, private_memory: false, transparent_hugepages: false, diff --git a/petri/src/vm/openvmm/modify.rs b/petri/src/vm/openvmm/modify.rs index ef93b552dd..8794ed4b9f 100644 --- a/petri/src/vm/openvmm/modify.rs +++ b/petri/src/vm/openvmm/modify.rs @@ -18,13 +18,13 @@ use gdma_resources::GdmaDeviceHandle; use gdma_resources::VportDefinition; use get_resources::ged::IgvmAttestTestConfig; use guid::Guid; -use memory_range::MemoryRange; use net_backend_resources::mac_address::MacAddress; use nvme_resources::NamespaceDefinition; use nvme_resources::NvmeControllerHandle; use openvmm_defs::config::Config; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::LoadMode; +use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieDeviceConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; @@ -242,27 +242,9 @@ impl PetriVmConfigOpenVmm { root_complex_per_segment: u64, root_ports_per_root_complex: u64, ) -> Self { - const SINGLE_BUS_NUMBER_ECAM_SIZE: u64 = 1024 * 1024; // 1 MB - const FULL_SEGMENT_ECAM_SIZE: u64 = 256 * SINGLE_BUS_NUMBER_ECAM_SIZE; // 256 MB const LOW_MMIO_SIZE: u64 = 64 * 1024 * 1024; // 64 MB const HIGH_MMIO_SIZE: u64 = 1024 * 1024 * 1024; // 1 GB - // Allocate and configure the address space gaps - let ecam_size = segment_count * FULL_SEGMENT_ECAM_SIZE; - let low_mmio_size = segment_count * root_complex_per_segment * LOW_MMIO_SIZE; - let high_mmio_size = segment_count * root_complex_per_segment * HIGH_MMIO_SIZE; - - let low_mmio_start = 
self.config.memory.mmio_gaps[0].start(); - let high_mmio_end = self.config.memory.mmio_gaps[1].end(); - - let ecam_gap = MemoryRange::new(low_mmio_start - ecam_size..low_mmio_start); - let low_gap = MemoryRange::new(ecam_gap.start() - low_mmio_size..ecam_gap.start()); - let high_gap = MemoryRange::new(high_mmio_end..high_mmio_end + high_mmio_size); - - self.config.memory.pci_ecam_gaps.push(ecam_gap); - self.config.memory.pci_mmio_gaps.push(low_gap); - self.config.memory.pci_mmio_gaps.push(high_gap); - // Add the root complexes to the VM for segment in 0..segment_count { let bus_count_per_rc = 256 / root_complex_per_segment; @@ -273,17 +255,6 @@ impl PetriVmConfigOpenVmm { let start_bus = rc_index_in_segment * bus_count_per_rc; let end_bus = start_bus + bus_count_per_rc - 1; - let ecam_range_start = ecam_gap.start() - + segment * FULL_SEGMENT_ECAM_SIZE - + start_bus * SINGLE_BUS_NUMBER_ECAM_SIZE; - let ecam_range_end = - ecam_range_start + bus_count_per_rc * SINGLE_BUS_NUMBER_ECAM_SIZE; - - let low_mmio_start = low_gap.start() + index * LOW_MMIO_SIZE; - let low_mmio_end = low_gap.start() + (index + 1) * LOW_MMIO_SIZE; - let high_mmio_start = high_gap.start() + index * HIGH_MMIO_SIZE; - let high_mmio_end = high_gap.start() + (index + 1) * HIGH_MMIO_SIZE; - let ports = (0..root_ports_per_root_complex) .map(|i| PcieRootPortConfig { name: format!("s{}rc{}rp{}", segment, rc_index_in_segment, i), @@ -298,9 +269,13 @@ impl PetriVmConfigOpenVmm { segment: segment.try_into().unwrap(), start_bus: start_bus.try_into().unwrap(), end_bus: end_bus.try_into().unwrap(), - ecam_range: MemoryRange::new(ecam_range_start..ecam_range_end), - low_mmio: MemoryRange::new(low_mmio_start..low_mmio_end), - high_mmio: MemoryRange::new(high_mmio_start..high_mmio_end), + ecam_range: None, + low_mmio: MmioRangeConfig::Dynamic { + size: LOW_MMIO_SIZE, + }, + high_mmio: MmioRangeConfig::Dynamic { + size: HIGH_MMIO_SIZE, + }, ports, }); } From 3f62efc4c12ca75f2cc7cfc55e15aefbb260ac51 Mon Sep 17 
00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 06:15:36 +0000 Subject: [PATCH 11/36] virtio-mmio --- openvmm/openvmm_core/src/worker/dispatch.rs | 86 +++++++++---------- .../openvmm_core/src/worker/memory_layout.rs | 70 +++++++++++++++ 2 files changed, 112 insertions(+), 44 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 1ab093c424..b54dafb55b 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -401,6 +401,7 @@ pub(crate) struct InitializedVm { cfg: Manifest, mem_layout: MemoryLayout, resolved_pcie_root_complex_ranges: Vec, + virtio_mmio_region: Option, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, @@ -682,7 +683,7 @@ struct LoadedVmInner { chipset_cfg: BaseChipsetManifest, chipset_capabilities: VmChipsetCapabilities, #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] - virtio_mmio_count: usize, + virtio_mmio_region: Option, #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] virtio_mmio_irq: u32, /// ((device, function), interrupt) @@ -901,17 +902,25 @@ impl InitializedVm { // TODO: The vNUMA nodes reported are meant for test usage only, as they // are not aligned to any physical NUMA node. There is more work to do // to support useful vNUMA reporting. 
+ let virtio_mmio_count = cfg + .virtio_devices + .iter() + .filter(|(bus, _)| matches!(bus, VirtioBus::Mmio)) + .count(); + let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), mmio_gaps: &cfg.memory.mmio_gaps, pcie_root_complexes: &cfg.pcie_root_complexes, + virtio_mmio_count, vtl2_layout, physical_address_size, }) .context("invalid memory configuration")?; let mem_layout = resolved_layout.memory_layout; let resolved_pcie_root_complex_ranges = resolved_layout.pcie_root_complex_ranges; + let virtio_mmio_region = resolved_layout.virtio_mmio_region; // Place the alias map at the end of the address space. Newer versions // of OpenHCL support receiving this offset via devicetree (especially @@ -1024,6 +1033,7 @@ impl InitializedVm { cfg, mem_layout, resolved_pcie_root_complex_ranges, + virtio_mmio_region, processor_topology, igvm_file, driver_source, @@ -1051,6 +1061,7 @@ impl InitializedVm { cfg, mem_layout, resolved_pcie_root_complex_ranges, + virtio_mmio_region, processor_topology, igvm_file, driver_source, @@ -2335,15 +2346,11 @@ impl InitializedVm { // add virtio devices - // Construct virtio devices. - // - // TODO: allocate PCI and MMIO space better. + // Construct virtio devices. Virtio-mmio device addresses are resolved + // by the memory layout allocator; each slot is a 4 KiB Mmio32 + // allocation indexed by the order of VirtioBus::Mmio devices. 
let mut pci_device_number = 10; - if mem_layout.mmio().len() < 2 { - anyhow::bail!("at least two mmio regions are required"); - } - let mut virtio_mmio_start = mem_layout.mmio()[1].end(); - let mut virtio_mmio_count = 0; + let mut virtio_mmio_index = 0; // Avoid an ISA interrupt to avoid conflicts and to avoid needing to // configure the line as level-triggered in the MADT (necessary for @@ -2369,8 +2376,10 @@ impl InitializedVm { .await?; match bus { VirtioBus::Mmio => { - let mmio_start = virtio_mmio_start - 0x1000; - virtio_mmio_start -= 0x1000; + let region = virtio_mmio_region + .expect("virtio_mmio_region must be allocated for Mmio devices"); + let mmio_start = region.start() + virtio_mmio_index as u64 * 0x1000; + virtio_mmio_index += 1; let id = format!("{id}-{mmio_start}"); let gm = gm.clone(); chipset_builder.arc_mutex_device(id).try_add(|services| { @@ -2384,7 +2393,6 @@ impl InitializedVm { 0x1000, ) })?; - virtio_mmio_count += 1; } VirtioBus::Pci => { let pci_inta_line = pci_inta_line.context("missing PCI INT#A line")?; @@ -2421,8 +2429,6 @@ impl InitializedVm { } } - assert!(virtio_mmio_start >= mem_layout.mmio()[1].start()); - let (chipset, devices) = chipset_builder.build()?; let (fatal_error_send, _fatal_error_recv) = mesh::channel(); let chipset = vmm_core::vmotherboard_adapter::AdaptedChipset::new( @@ -2521,7 +2527,7 @@ impl InitializedVm { chipset_capabilities: cfg.chipset_capabilities, firmware_event_send: cfg.firmware_event_send, load_mode: cfg.load_mode, - virtio_mmio_count, + virtio_mmio_region, virtio_mmio_irq, pci_legacy_interrupts, igvm_file, @@ -2628,7 +2634,7 @@ impl LoadedVmInner { dsdt, &self.chipset_cfg, enable_serial, - self.virtio_mmio_count, + self.virtio_mmio_region.as_ref(), self.virtio_mmio_irq, &self.pci_legacy_interrupts, ) @@ -3380,7 +3386,7 @@ fn add_devices_to_dsdt_x64( dsdt: &mut dsdt::Dsdt, cfg: &BaseChipsetManifest, serial_uarts: bool, - virtio_mmio_count: usize, + virtio_mmio_region: Option<&MemoryRange>, 
virtio_mmio_irq: u32, pci_legacy_interrupts: &[((u8, Option), u32)], // ((device, function), interrupt) ) { @@ -3406,35 +3412,27 @@ fn add_devices_to_dsdt_x64( "the DSDT describes two MMIO regions" ); let low_mmio_gap = mem_layout.mmio()[0]; - let mut high_mmio_space: std::ops::Range = mem_layout.mmio()[1].into(); - // Device(\_SB.VI00) - // { - // Name(_HID, "LNRO0005") - // Name(_UID, 0) - // Name(_CRS, ResourceTemplate() - // { - // QWORDMemory(,,,,,ReadWrite,0,0x1fffff000,0x1ffffffff,0,0x1000) - // Interrupt(ResourceConsumer, Level, ActiveHigh, Exclusive) - // {5} - // }) - // } - // TODO: manage MMIO space better than this - for i in 0..virtio_mmio_count { - high_mmio_space.end -= HV_PAGE_SIZE; - let mut device = dsdt::Device::new(format!("\\_SB.VI{i:02}").as_bytes()); - device.add_object(&dsdt::NamedString::new(b"_HID", b"LNRO0005")); - device.add_object(&dsdt::NamedInteger::new(b"_UID", i as u64)); - let mut crs = dsdt::CurrentResourceSettings::new(); - crs.add_resource(&dsdt::QwordMemory::new(high_mmio_space.end, HV_PAGE_SIZE)); - let mut intr = dsdt::Interrupt::new(virtio_mmio_irq); - intr.is_edge_triggered = false; - crs.add_resource(&intr); - device.add_object(&crs); - dsdt.add_object(&device); + let high_mmio_gap = mem_layout.mmio()[1]; + + // Virtio-mmio devices are allocated as a contiguous region by the memory + // layout resolver. Each 4 KiB slot is a separate device. 
+ if let Some(region) = virtio_mmio_region { + let slot_count = region.len() / HV_PAGE_SIZE; + for i in 0..slot_count { + let slot_base = region.start() + i * HV_PAGE_SIZE; + let mut device = dsdt::Device::new(format!("\\_SB.VI{i:02}").as_bytes()); + device.add_object(&dsdt::NamedString::new(b"_HID", b"LNRO0005")); + device.add_object(&dsdt::NamedInteger::new(b"_UID", i)); + let mut crs = dsdt::CurrentResourceSettings::new(); + crs.add_resource(&dsdt::QwordMemory::new(slot_base, HV_PAGE_SIZE)); + let mut intr = dsdt::Interrupt::new(virtio_mmio_irq); + intr.is_edge_triggered = false; + crs.add_resource(&intr); + device.add_object(&crs); + dsdt.add_object(&device); + } } - let high_mmio_gap = MemoryRange::new(high_mmio_space); - if cfg.with_generic_pci_bus || cfg.with_i440bx_host_pci_bridge { // TODO: actually plumb through legacy PCI interrupts dsdt.add_pci(low_mmio_gap, high_mmio_gap, pci_legacy_interrupts); diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 8eb01e87de..c4a268144c 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -37,6 +37,10 @@ const PCIE_ECAM_BYTES_PER_BUS: u64 = 32 * 8 * 4096; pub(super) struct ResolvedMemoryLayout { pub memory_layout: MemoryLayout, pub pcie_root_complex_ranges: Vec, + /// Contiguous MMIO region for all virtio-mmio device slots. Each slot is + /// 4 KiB, indexed from the start of the region. `None` when no + /// virtio-mmio devices are configured. + pub virtio_mmio_region: Option, } #[derive(Debug)] @@ -59,6 +63,9 @@ pub(super) struct MemoryLayoutInput<'a> { /// PCIe root complex address-space intents. These are resolved by this /// worker step so front ends do not need to carve guest physical addresses. pub pcie_root_complexes: &'a [PcieRootComplexConfig], + /// Number of virtio-mmio device slots to allocate in 32-bit MMIO space. 
+ /// A single contiguous region of `count * 4 KiB` is allocated. + pub virtio_mmio_count: usize, /// Optional IGVM VTL2 private-memory request. This is allocated after all /// VTL0-visible RAM and MMIO and is carried separately from ordinary RAM. pub vtl2_layout: Option, @@ -83,6 +90,7 @@ pub(super) fn resolve_memory_layout( }) .collect::>(); let mut vtl2_range = MemoryRange::EMPTY; + let mut virtio_mmio_region = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); add_fixed_ranges(&mut builder, "mmio", input.mmio_gaps); @@ -128,6 +136,19 @@ pub(super) fn resolve_memory_layout( ); } + // Virtio-mmio: allocate one contiguous region for all slots. Each slot is + // 4 KiB, so the region is `count * 4 KiB` placed as a single Mmio32 + // request. + if input.virtio_mmio_count > 0 { + builder.request( + "virtio_mmio".to_string(), + &mut virtio_mmio_region, + input.virtio_mmio_count as u64 * PAGE_SIZE, + PAGE_SIZE, + Placement::Mmio32, + ); + } + // RAM request order is part of the NUMA compatibility contract: the first // request maps to vnode 0, the second to vnode 1, and so on. For GB-sized // nodes, use GB alignment so holes do not create sub-GB RAM chunks. 
For @@ -220,9 +241,16 @@ pub(super) fn resolve_memory_layout( ); } + let virtio_mmio_region = if input.virtio_mmio_count > 0 { + Some(virtio_mmio_region) + } else { + None + }; + Ok(ResolvedMemoryLayout { memory_layout, pcie_root_complex_ranges, + virtio_mmio_region, }) } @@ -337,6 +365,7 @@ mod tests { numa_mem_sizes, mmio_gaps, pcie_root_complexes: &[], + virtio_mmio_count: 0, vtl2_layout, physical_address_size: 46, } @@ -550,4 +579,45 @@ mod tests { assert!(err.to_string().contains("memory layout ends at")); } + + #[test] + fn virtio_mmio_slots_are_allocated_in_mmio32() { + let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; + let mut config = input(2 * GB, None, &mmio, None); + config.virtio_mmio_count = 3; + + let result = resolve_memory_layout(config).unwrap(); + + let region = result + .virtio_mmio_region + .expect("should have virtio-mmio region"); + assert_eq!(region.len(), 3 * PAGE_SIZE); + assert!(region.end() <= 4 * GB, "virtio-mmio should be below 4 GiB"); + assert!( + !MemoryRange::new(0xf800_0000..4 * GB).overlaps(&region), + "virtio-mmio should not overlap with chipset MMIO gap" + ); + } + + #[test] + fn virtio_mmio_does_not_move_ram() { + let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; + + let without = resolve(input(2 * GB, None, &mmio, None)); + let mut config = input(2 * GB, None, &mmio, None); + config.virtio_mmio_count = 2; + let with = resolve_memory_layout(config).unwrap(); + + assert_eq!(with.memory_layout.ram(), without.ram()); + } + + #[test] + fn zero_virtio_mmio_produces_no_region() { + let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; + let config = input(2 * GB, None, &mmio, None); + + let result = resolve_memory_layout(config).unwrap(); + + assert!(result.virtio_mmio_region.is_none()); + } } From 23477ca35cbee4e9b48a67aebf40c6f3131ffe84 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 16:32:09 +0000 Subject: [PATCH 12/36] wip --- Cargo.lock | 4 +- openhcl/underhill_core/src/worker.rs | 1 + 
openvmm/openvmm_core/src/worker/dispatch.rs | 38 +- .../openvmm_core/src/worker/memory_layout.rs | 380 ++++++++++++------ .../src/worker/vm_loaders/igvm.rs | 62 +-- .../src/worker/vm_loaders/linux.rs | 11 +- openvmm/openvmm_defs/src/config.rs | 43 +- openvmm/openvmm_entry/Cargo.toml | 1 - openvmm/openvmm_entry/src/lib.rs | 30 +- openvmm/openvmm_entry/src/ttrpc/mod.rs | 5 +- petri/Cargo.toml | 1 - petri/src/vm/mod.rs | 14 - petri/src/vm/openvmm/construct.rs | 28 +- vmm_core/vm_manifest_builder/Cargo.toml | 1 + vmm_core/vm_manifest_builder/src/lib.rs | 39 ++ vmm_core/vmm_core_defs/Cargo.toml | 1 + vmm_core/vmm_core_defs/src/lib.rs | 15 + .../tests/tests/multiarch/memstat.rs | 1 - .../tests/x86_64/openhcl_linux_direct.rs | 26 +- 19 files changed, 413 insertions(+), 288 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 163090f441..c11ea08871 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5427,7 +5427,6 @@ dependencies = [ "input_core", "inspect", "inspect_proto", - "memory_range", "mesh", "mesh_process", "mesh_rpc", @@ -6019,7 +6018,6 @@ dependencies = [ "kmsg", "libtest-mimic", "linkme", - "memory_range", "mesh", "mesh_process", "mesh_worker", @@ -9515,6 +9513,7 @@ dependencies = [ "serial_pl011_resources", "thiserror 2.0.16", "vm_resource", + "vmm_core_defs", "vmotherboard", ] @@ -10075,6 +10074,7 @@ name = "vmm_core_defs" version = "0.0.0" dependencies = [ "inspect", + "memory_range", "mesh", "virt", ] diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 9eb32a722c..a9b763d73d 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -2384,6 +2384,7 @@ async fn new_underhill_vm( mut chipset_devices, pci_chipset_devices, capabilities, + .. 
} = chipset .build() .context("failed to build chipset configuration")?; diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index b54dafb55b..9eff7f1011 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -57,6 +57,7 @@ use openvmm_defs::config::GicConfig; use openvmm_defs::config::HypervisorConfig; use openvmm_defs::config::LoadMode; use openvmm_defs::config::MemoryConfig; +use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieDeviceConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieSwitchConfig; @@ -200,6 +201,9 @@ impl Manifest { chipset_devices: config.chipset_devices, pci_chipset_devices: config.pci_chipset_devices, chipset_capabilities: config.chipset_capabilities, + chipset_low_mmio: config.chipset_low_mmio, + chipset_high_mmio: config.chipset_high_mmio, + vtl2_chipset_mmio: config.vtl2_chipset_mmio, generation_id_recv: config.generation_id_recv, rtc_delta_milliseconds: config.rtc_delta_milliseconds, automatic_guest_reset: config.automatic_guest_reset, @@ -249,6 +253,9 @@ pub struct Manifest { chipset_devices: Vec, pci_chipset_devices: Vec, chipset_capabilities: VmChipsetCapabilities, + chipset_low_mmio: Option, + chipset_high_mmio: Option, + vtl2_chipset_mmio: Option, generation_id_recv: Option>, rtc_delta_milliseconds: i64, automatic_guest_reset: bool, @@ -402,6 +409,9 @@ pub(crate) struct InitializedVm { mem_layout: MemoryLayout, resolved_pcie_root_complex_ranges: Vec, virtio_mmio_region: Option, + chipset_low_mmio: Option, + chipset_high_mmio: Option, + vtl2_chipset_mmio: Option, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, @@ -686,6 +696,12 @@ struct LoadedVmInner { virtio_mmio_region: Option, #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] virtio_mmio_irq: u32, + /// Chipset low MMIO range for VMOD/PCI0 _CRS. 
+ chipset_low_mmio: Option, + /// Chipset high MMIO range for VMOD/PCI0 _CRS. + chipset_high_mmio: Option, + /// VTL2-private chipset MMIO range for VTL2 VMBus. + vtl2_chipset_mmio: Option, /// ((device, function), interrupt) #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] pci_legacy_interrupts: Vec<((u8, Option), u32)>, @@ -911,7 +927,9 @@ impl InitializedVm { let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), - mmio_gaps: &cfg.memory.mmio_gaps, + chipset_low_mmio: cfg.chipset_low_mmio.as_ref(), + chipset_high_mmio: cfg.chipset_high_mmio.as_ref(), + vtl2_chipset_mmio: cfg.vtl2_chipset_mmio.as_ref(), pcie_root_complexes: &cfg.pcie_root_complexes, virtio_mmio_count, vtl2_layout, @@ -921,6 +939,9 @@ impl InitializedVm { let mem_layout = resolved_layout.memory_layout; let resolved_pcie_root_complex_ranges = resolved_layout.pcie_root_complex_ranges; let virtio_mmio_region = resolved_layout.virtio_mmio_region; + let chipset_low_mmio = resolved_layout.chipset_low_mmio; + let chipset_high_mmio = resolved_layout.chipset_high_mmio; + let vtl2_chipset_mmio = resolved_layout.vtl2_chipset_mmio; // Place the alias map at the end of the address space. 
Newer versions // of OpenHCL support receiving this offset via devicetree (especially @@ -1034,6 +1055,9 @@ impl InitializedVm { mem_layout, resolved_pcie_root_complex_ranges, virtio_mmio_region, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, processor_topology, igvm_file, driver_source, @@ -1062,6 +1086,9 @@ impl InitializedVm { mem_layout, resolved_pcie_root_complex_ranges, virtio_mmio_region, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, processor_topology, igvm_file, driver_source, @@ -2529,6 +2556,9 @@ impl InitializedVm { load_mode: cfg.load_mode, virtio_mmio_region, virtio_mmio_irq, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, pci_legacy_interrupts, igvm_file, next_igvm_file: None, @@ -2770,6 +2800,9 @@ impl LoadedVmInner { with_vmbus_redirect: self.vmbus_redirect, com_serial, entropy: Some(&entropy), + chipset_low_mmio: self.chipset_low_mmio, + chipset_high_mmio: self.chipset_high_mmio, + vtl2_chipset_mmio: self.vtl2_chipset_mmio, }; super::vm_loaders::igvm::load_igvm(params)? } @@ -3344,6 +3377,9 @@ impl LoadedVm { chipset_devices: vec![], // TODO pci_chipset_devices: vec![], // TODO chipset_capabilities: self.inner.chipset_capabilities, + chipset_low_mmio: None, // TODO + chipset_high_mmio: None, // TODO + vtl2_chipset_mmio: None, // TODO generation_id_recv: None, // TODO rtc_delta_milliseconds: 0, // TODO automatic_guest_reset: self.inner.automatic_guest_reset, diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index c4a268144c..d587a28f0e 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -10,10 +10,11 @@ //! to the request order, placement class, or alignment policy can break resume or //! restore. Keep layout policy changes deliberate and covered by tests. //! -//! The resolver keeps today's MMIO inputs fixed while moving RAM and VTL2 -//! 
placement into `vm_topology::layout`. Fixed ranges are registered first so RAM -//! splits around them. VTL2 is registered last as post-MMIO private memory so it -//! does not perturb the VTL0-visible RAM/MMIO layout. +//! The resolver owns all layout consumers: architectural reserved zones (LAPIC, +//! IOAPIC, GIC, etc.), chipset MMIO (VMBus relay, PIIX4 PCI BARs), PCIe +//! ECAM/BAR pools, virtio-mmio slots, ordinary RAM, VTL2 private memory, and +//! VTL2 chipset MMIO. Callers express sizing intent; the resolver places +//! everything and derives the effective MMIO gaps for [`MemoryLayout`]. use super::vm_loaders::igvm::Vtl2MemoryLayoutRequest; use anyhow::Context; @@ -41,6 +42,15 @@ pub(super) struct ResolvedMemoryLayout { /// 4 KiB, indexed from the start of the region. `None` when no /// virtio-mmio devices are configured. pub virtio_mmio_region: Option, + /// Chipset low MMIO range (below 4 GiB) for VMOD/PCI0 _CRS. `None` when + /// no VMBus / chipset MMIO is configured. + pub chipset_low_mmio: Option, + /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. `None` when + /// no VMBus / chipset MMIO is configured. + pub chipset_high_mmio: Option, + /// VTL2-private chipset MMIO range, reported to VTL2 VMBus via the device + /// tree. `None` when VTL2 is not configured or has no chipset MMIO. + pub vtl2_chipset_mmio: Option, } #[derive(Debug)] @@ -56,10 +66,15 @@ pub(super) struct MemoryLayoutInput<'a> { /// Optional per-vNUMA RAM budgets. When present, these must sum to /// `mem_size`, and request order is the vnode assignment order. pub numa_mem_sizes: Option<&'a [u64]>, - /// Existing resolved chipset/MMIO ranges. These are fixed for this - /// transition step; later commits will move individual consumers to typed - /// dynamic intents. - pub mmio_gaps: &'a [MemoryRange], + /// Chipset low MMIO range (below 4 GiB). This is the VMOD/PCI0 _CRS range + /// for VMBus relay devices and PIIX4 PCI BARs. 
+ pub chipset_low_mmio: Option<&'a MmioRangeConfig>, + /// Chipset high MMIO range (above RAM). This is the VMOD/PCI0 _CRS high + /// range for VMBus relay devices. + pub chipset_high_mmio: Option<&'a MmioRangeConfig>, + /// VTL2-private chipset MMIO range. Placed after all VTL0-visible layout + /// so enabling VTL2 does not move VTL0 addresses. + pub vtl2_chipset_mmio: Option<&'a MmioRangeConfig>, /// PCIe root complex address-space intents. These are resolved by this /// worker step so front ends do not need to carve guest physical addresses. pub pcie_root_complexes: &'a [PcieRootComplexConfig], @@ -74,6 +89,12 @@ pub(super) struct MemoryLayoutInput<'a> { pub physical_address_size: u8, } +/// Architectural reserved zone for x86_64: LAPIC, IOAPIC, battery, TPM. +const ARCH_RESERVED_X86_64: MemoryRange = MemoryRange::new(0xFE00_0000..0x1_0000_0000); + +/// Architectural reserved zone for aarch64: GIC, PL011, battery. +const ARCH_RESERVED_AARCH64: MemoryRange = MemoryRange::new(0xEF00_0000..0x1_0000_0000); + pub(super) fn resolve_memory_layout( input: MemoryLayoutInput<'_>, ) -> anyhow::Result { @@ -91,9 +112,45 @@ pub(super) fn resolve_memory_layout( .collect::>(); let mut vtl2_range = MemoryRange::EMPTY; let mut virtio_mmio_region = MemoryRange::EMPTY; + let mut chipset_low_mmio = MemoryRange::EMPTY; + let mut chipset_high_mmio = MemoryRange::EMPTY; + let mut vtl2_chipset_mmio = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); - add_fixed_ranges(&mut builder, "mmio", input.mmio_gaps); + + // Architectural reserved zone — pinned addresses that no dynamic consumer + // may overlap (LAPIC, IOAPIC, GIC, PL011, battery, TPM, etc.). + let arch_reserved = if cfg!(guest_arch = "x86_64") { + ARCH_RESERVED_X86_64 + } else { + ARCH_RESERVED_AARCH64 + }; + builder.reserve("arch_reserved".to_string(), arch_reserved); + + // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus relay + // devices and PIIX4 PCI BARs. 
+ if let Some(config) = input.chipset_low_mmio { + add_mmio_range( + &mut builder, + "chipset_low_mmio".to_string(), + &mut chipset_low_mmio, + config, + TWO_MB, + Placement::Mmio32, + ); + } + + // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. + if let Some(config) = input.chipset_high_mmio { + add_mmio_range( + &mut builder, + "chipset_high_mmio".to_string(), + &mut chipset_high_mmio, + config, + TWO_MB, + Placement::Mmio64, + ); + } for (index, (root_complex, ranges)) in input .pcie_root_complexes @@ -110,7 +167,7 @@ pub(super) fn resolve_memory_layout( PCIE_ECAM_BYTES_PER_BUS, ); // Low MMIO: 2 MB aligned. - add_pcie_mmio_range( + add_mmio_range( &mut builder, format!("pcie[{index}].low_mmio"), &mut ranges.low_mmio, @@ -126,7 +183,7 @@ pub(super) fn resolve_memory_layout( // The downside of this approach is that the maximum mappable BAR size // is a function of the rest of the topology, which can create // reliability issues for users. - add_pcie_mmio_range( + add_mmio_range( &mut builder, format!("pcie[{index}].high_mmio"), &mut ranges.high_mmio, @@ -164,6 +221,19 @@ pub(super) fn resolve_memory_layout( builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, ram_alignment); } + // VTL2 chipset MMIO is implementation-private — placed after all + // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. + if let Some(config) = input.vtl2_chipset_mmio { + add_mmio_range( + &mut builder, + "vtl2_chipset_mmio".to_string(), + &mut vtl2_chipset_mmio, + config, + TWO_MB, + Placement::PostMmio, + ); + } + // VTL2 MemoryLayout mode is implementation-private memory, not a VTL0 RAM // hole. Allocate it only after all VTL0-visible RAM/MMIO so enabling VTL2 // does not move the VTL0 layout. @@ -200,9 +270,24 @@ pub(super) fn resolve_memory_layout( let vtl2_range = input.vtl2_layout.map(|_| vtl2_range); - // `MemoryLayout` remains the shared validation and query type for the rest - // of the worker. 
Construct it from resolved RAM so no later consumer repeats - // RAM placement or infers RAM by subtracting from MMIO gaps. + // Derive the effective MMIO gaps from the resolved allocations. These are + // the non-RAM, non-VTL2 ranges that MemoryLayout stores as `mmio()`. We + // collect chipset MMIO, PCIe, virtio-mmio, and the architectural reserved + // zone into sorted gap vectors so existing consumers of + // `MemoryLayout::mmio()` keep working. + let mut mmio_gaps: Vec = Vec::new(); + mmio_gaps.push(arch_reserved); + if input.chipset_low_mmio.is_some() { + mmio_gaps.push(chipset_low_mmio); + } + if input.chipset_high_mmio.is_some() { + mmio_gaps.push(chipset_high_mmio); + } + if input.virtio_mmio_count > 0 { + mmio_gaps.push(virtio_mmio_region); + } + mmio_gaps.sort(); + let mut pci_ecam_gaps: Vec = Vec::new(); pci_ecam_gaps.extend( pcie_root_complex_ranges @@ -221,7 +306,7 @@ pub(super) fn resolve_memory_layout( let memory_layout = MemoryLayout::new_from_resolved_ranges( ram, - input.mmio_gaps.to_vec(), + mmio_gaps, pci_ecam_gaps, pci_mmio_gaps, vtl2_range, @@ -251,6 +336,9 @@ pub(super) fn resolve_memory_layout( memory_layout, pcie_root_complex_ranges, virtio_mmio_region, + chipset_low_mmio: input.chipset_low_mmio.map(|_| chipset_low_mmio), + chipset_high_mmio: input.chipset_high_mmio.map(|_| chipset_high_mmio), + vtl2_chipset_mmio: input.vtl2_chipset_mmio.map(|_| vtl2_chipset_mmio), }) } @@ -285,7 +373,7 @@ fn add_pcie_ecam_range<'a>( } } -fn add_pcie_mmio_range<'a>( +fn add_mmio_range<'a>( builder: &mut LayoutBuilder<'a>, tag: String, target: &'a mut MemoryRange, @@ -304,15 +392,6 @@ fn add_pcie_mmio_range<'a>( } } -fn add_fixed_ranges(builder: &mut LayoutBuilder<'_>, tag_prefix: &str, ranges: &[MemoryRange]) { - // These are fixed only from the allocator's point of view. Today they are - // already-resolved config fields; future commits will replace some of them - // with typed dynamic requests owned by this resolver. 
- for (index, range) in ranges.iter().enumerate() { - builder.fixed(format!("{tag_prefix}[{index}]"), *range); - } -} - fn validate_ram_sizes(mem_size: u64, numa_mem_sizes: Option<&[u64]>) -> anyhow::Result> { // Keep validation compatible with `MemoryLayout::new()` / `new_with_numa()`: // RAM sizes are page-granular, nonzero, and NUMA budgets must exactly cover @@ -353,17 +432,32 @@ mod tests { use vm_topology::memory::AddressType; const MB: u64 = 1024 * 1024; + const DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64: u64 = 96 * 1024 * 1024; + const DEFAULT_CHIPSET_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; + const DEFAULT_VTL2_CHIPSET_MMIO_SIZE: u64 = GB; - fn input<'a>( + const DEFAULT_CHIPSET_LOW: MmioRangeConfig = MmioRangeConfig::Dynamic { + size: DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64, + }; + const DEFAULT_CHIPSET_HIGH: MmioRangeConfig = MmioRangeConfig::Dynamic { + size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, + }; + + const DEFAULT_VTL2_CHIPSET: MmioRangeConfig = MmioRangeConfig::Dynamic { + size: DEFAULT_VTL2_CHIPSET_MMIO_SIZE, + }; + + fn input( mem_size: u64, - numa_mem_sizes: Option<&'a [u64]>, - mmio_gaps: &'a [MemoryRange], + numa_mem_sizes: Option<&[u64]>, vtl2_layout: Option, - ) -> MemoryLayoutInput<'a> { + ) -> MemoryLayoutInput<'_> { MemoryLayoutInput { mem_size, numa_mem_sizes, - mmio_gaps, + chipset_low_mmio: Some(&DEFAULT_CHIPSET_LOW), + chipset_high_mmio: Some(&DEFAULT_CHIPSET_HIGH), + vtl2_chipset_mmio: None, pcie_root_complexes: &[], virtio_mmio_count: 0, vtl2_layout, @@ -401,112 +495,108 @@ mod tests { } #[test] - fn non_numa_matches_memory_layout_new() { - let mmio = [ - MemoryRange::new(2 * GB..3 * GB), - MemoryRange::new(4 * GB..5 * GB), - ]; - - let actual = resolve(input(6 * GB, None, &mmio, None)); - let expected = MemoryLayout::new(6 * GB, &mmio, &[], &[], None).unwrap(); + fn basic_ram_placement() { + let actual = resolve(input(2 * GB, None, None)); - assert_eq!(actual.ram(), expected.ram()); - assert_eq!(actual.mmio(), expected.mmio()); - 
assert_eq!(actual.ram_size(), expected.ram_size()); - assert_eq!(actual.end_of_ram(), expected.end_of_ram()); - assert_eq!(actual.end_of_layout(), expected.end_of_layout()); + assert_eq!(actual.ram_size(), 2 * GB); + // RAM starts at GPA 0 and fills upward. + assert_eq!(actual.ram()[0].range.start(), 0); } #[test] - fn numa_preserves_node_ordering_and_splitting() { - let mmio = [MemoryRange::new(3 * GB..4 * GB)]; - let sizes = [2 * GB, 2 * GB]; - - let actual = resolve(input(4 * GB, Some(&sizes), &mmio, None)); - let expected = MemoryLayout::new_with_numa(&sizes, &mmio, &[], &[], None).unwrap(); + fn ram_splits_around_arch_reserved_zone() { + // 4 GiB of RAM must split around the architectural reserved zone + // and the chipset MMIO allocations below 4 GiB. + let actual = resolve(input(4 * GB, None, None)); - assert_eq!(actual.ram(), expected.ram()); + assert_eq!(actual.ram_size(), 4 * GB); + // RAM must not overlap the architectural reserved zone. + let reserved = ARCH_RESERVED_X86_64; + for ram in actual.ram() { + assert!( + !ram.range.overlaps(&reserved), + "RAM {:?} overlaps reserved {:?}", + ram.range, + reserved + ); + } } #[test] - fn fixed_ranges_are_occupied_for_ram() { - let mmio = [ - MemoryRange::new(GB..2 * GB), - MemoryRange::new(3 * GB..3 * GB + MB), - MemoryRange::new(4 * GB..5 * GB), - ]; + fn numa_preserves_node_ordering() { + let sizes = [2 * GB, 2 * GB]; - let actual = resolve(input(4 * GB, None, &mmio, None)); + let actual = resolve(input(4 * GB, Some(&sizes), None)); + // First vnode's RAM starts at 0. + assert_eq!(actual.ram()[0].vnode, 0); + assert_eq!(actual.ram()[0].range.start(), 0); + // All RAM accounts for 4 GiB total. 
assert_eq!(actual.ram_size(), 4 * GB); + } + + #[test] + fn chipset_mmio_is_resolved() { + let result = resolve_memory_layout(input(2 * GB, None, None)).unwrap(); + + let low = result + .chipset_low_mmio + .expect("should have low chipset MMIO"); + let high = result + .chipset_high_mmio + .expect("should have high chipset MMIO"); + assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64); + assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); assert!( - actual - .ram() - .iter() - .all(|ram| { mmio.iter().all(|m| !ram.range.overlaps(m)) }) + low.end() <= 4 * GB, + "low chipset MMIO should be below 4 GiB" + ); + assert!( + high.start() >= 2 * GB, + "high chipset MMIO should be above RAM" ); } #[test] fn pcie_dynamic_intents_are_resolved() { - let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; let root_complexes = [pcie_root_complex( None, MmioRangeConfig::Dynamic { size: 64 * MB }, MmioRangeConfig::Dynamic { size: GB }, )]; - let mut config = input(2 * GB, None, &mmio, None); + let mut config = input(2 * GB, None, None); config.pcie_root_complexes = &root_complexes; let actual = resolve_memory_layout(config).unwrap(); let ranges = &actual.pcie_root_complex_ranges[0]; - assert_eq!( - ranges.ecam_range, - MemoryRange::new(0xf3f0_0000..0xf400_0000) + assert!( + ranges.ecam_range.end() <= 4 * GB, + "ECAM should be below 4 GiB" ); - assert_eq!(ranges.low_mmio, MemoryRange::new(0xf400_0000..0xf800_0000)); - assert_eq!(ranges.high_mmio, MemoryRange::new(2 * GB..3 * GB)); + assert_eq!(ranges.low_mmio.len(), 64 * MB); + assert_eq!(ranges.high_mmio.len(), GB); assert_eq!( - actual.memory_layout.probe_address(0xf3f0_0000), + actual + .memory_layout + .probe_address(ranges.ecam_range.start()), Some(AddressType::PciEcam) ); assert_eq!( - actual.memory_layout.probe_address(0xf400_0000), + actual.memory_layout.probe_address(ranges.low_mmio.start()), Some(AddressType::PciMmio) ); assert_eq!( - actual.memory_layout.probe_address(2 * GB), + 
actual.memory_layout.probe_address(ranges.high_mmio.start()), Some(AddressType::PciMmio) ); } - #[test] - fn gb_sized_ram_request_uses_gb_chunks() { - let mmio = [MemoryRange::new(GB + MB..GB + 2 * MB)]; - - let actual = resolve(input(2 * GB, None, &mmio, None)); - - assert_eq!( - actual.ram(), - &[ - MemoryRangeWithNode { - range: MemoryRange::new(0..GB), - vnode: 0, - }, - MemoryRangeWithNode { - range: MemoryRange::new(2 * GB..3 * GB), - vnode: 0, - }, - ] - ); - } - #[test] fn sub_gb_numa_nodes_use_two_mb_alignment() { let sizes = [512 * MB, 512 * MB]; - let actual = resolve(input(GB, Some(&sizes), &[], None)); + let actual = resolve(input(GB, Some(&sizes), None)); assert_eq!( actual.ram(), @@ -525,45 +615,31 @@ mod tests { #[test] fn vtl2_is_allocated_after_all_mmio() { - let mmio = [ - MemoryRange::new(GB..2 * GB), - MemoryRange::new(7 * GB..8 * GB), - ]; - - let actual = resolve(input(4 * GB, None, &mmio, Some(vtl2_layout(2 * MB)))); - - assert_eq!(actual.end_of_layout(), 8 * GB); - assert_eq!( - actual.vtl2_range(), - Some(MemoryRange::new(8 * GB..8 * GB + 2 * MB)) - ); + let actual = resolve(input(4 * GB, None, Some(vtl2_layout(2 * MB)))); + + assert!(actual.vtl2_range().is_some()); + let vtl2 = actual.vtl2_range().unwrap(); + assert_eq!(vtl2.len(), 2 * MB); + // VTL2 should be after all other allocations. 
+ for ram in actual.ram() { + assert!(vtl2.start() >= ram.range.end()); + } } #[test] fn vtl2_does_not_change_ram_placement() { - let mmio = [MemoryRange::new(GB..2 * GB)]; - - let without_vtl2 = resolve(input(2 * GB, None, &mmio, None)); - let with_vtl2 = resolve(input(2 * GB, None, &mmio, Some(vtl2_layout(2 * MB)))); + let without_vtl2 = resolve(input(2 * GB, None, None)); + let with_vtl2 = resolve(input(2 * GB, None, Some(vtl2_layout(2 * MB)))); assert_eq!(with_vtl2.ram(), without_vtl2.ram()); - assert_eq!(with_vtl2.end_of_layout(), without_vtl2.end_of_layout()); - assert_eq!( - with_vtl2.vtl2_range(), - Some(MemoryRange::new(3 * GB..3 * GB + 2 * MB)) - ); } #[test] fn deterministic_for_same_inputs() { - let mmio = [ - MemoryRange::new(GB..2 * GB), - MemoryRange::new(5 * GB..6 * GB), - ]; let sizes = [2 * GB, 3 * GB]; - let first = resolve(input(5 * GB, Some(&sizes), &mmio, None)); - let second = resolve(input(5 * GB, Some(&sizes), &mmio, None)); + let first = resolve(input(5 * GB, Some(&sizes), None)); + let second = resolve(input(5 * GB, Some(&sizes), None)); assert_eq!(first.ram(), second.ram()); assert_eq!(first.end_of_layout(), second.end_of_layout()); @@ -571,8 +647,9 @@ mod tests { #[test] fn host_width_validation_happens_after_allocation() { - let mmio = [MemoryRange::new(GB..4 * GB)]; - let mut config = input(3 * GB, None, &mmio, None); + // Use enough RAM that the layout (RAM + chipset high MMIO + arch + // reserved zone) exceeds 32 bits. 
+ let mut config = input(4 * GB, None, None); config.physical_address_size = 32; let err = resolve_memory_layout(config).unwrap_err(); @@ -582,8 +659,7 @@ mod tests { #[test] fn virtio_mmio_slots_are_allocated_in_mmio32() { - let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; - let mut config = input(2 * GB, None, &mmio, None); + let mut config = input(2 * GB, None, None); config.virtio_mmio_count = 3; let result = resolve_memory_layout(config).unwrap(); @@ -593,18 +669,12 @@ mod tests { .expect("should have virtio-mmio region"); assert_eq!(region.len(), 3 * PAGE_SIZE); assert!(region.end() <= 4 * GB, "virtio-mmio should be below 4 GiB"); - assert!( - !MemoryRange::new(0xf800_0000..4 * GB).overlaps(&region), - "virtio-mmio should not overlap with chipset MMIO gap" - ); } #[test] fn virtio_mmio_does_not_move_ram() { - let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; - - let without = resolve(input(2 * GB, None, &mmio, None)); - let mut config = input(2 * GB, None, &mmio, None); + let without = resolve(input(2 * GB, None, None)); + let mut config = input(2 * GB, None, None); config.virtio_mmio_count = 2; let with = resolve_memory_layout(config).unwrap(); @@ -613,11 +683,53 @@ mod tests { #[test] fn zero_virtio_mmio_produces_no_region() { - let mmio = [MemoryRange::new(0xf800_0000..4 * GB)]; - let config = input(2 * GB, None, &mmio, None); + let config = input(2 * GB, None, None); let result = resolve_memory_layout(config).unwrap(); assert!(result.virtio_mmio_region.is_none()); } + + #[test] + fn vtl2_chipset_mmio_is_post_mmio() { + let mut config = input(2 * GB, None, None); + config.vtl2_chipset_mmio = Some(&DEFAULT_VTL2_CHIPSET); + + let result = resolve_memory_layout(config).unwrap(); + + let vtl2_mmio = result + .vtl2_chipset_mmio + .expect("should have VTL2 chipset MMIO"); + assert_eq!(vtl2_mmio.len(), DEFAULT_VTL2_CHIPSET_MMIO_SIZE); + // VTL2 chipset MMIO should be after all VTL0-visible ranges. 
+ let chipset_high = result + .chipset_high_mmio + .expect("should have high chipset MMIO"); + assert!( + vtl2_mmio.start() >= chipset_high.end(), + "VTL2 chipset MMIO should be after VTL0 high MMIO" + ); + } + + #[test] + fn vtl2_chipset_mmio_does_not_move_vtl0_layout() { + let without = resolve(input(2 * GB, None, None)); + let mut config = input(2 * GB, None, None); + config.vtl2_chipset_mmio = Some(&DEFAULT_VTL2_CHIPSET); + let with = resolve_memory_layout(config).unwrap(); + + assert_eq!(with.memory_layout.ram(), without.ram()); + } + + #[test] + fn no_chipset_mmio_when_none() { + let mut config = input(2 * GB, None, None); + config.chipset_low_mmio = None; + config.chipset_high_mmio = None; + + let result = resolve_memory_layout(config).unwrap(); + + assert!(result.chipset_low_mmio.is_none()); + assert!(result.chipset_high_mmio.is_none()); + } } diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index 399ff44c63..01abe0c70b 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -259,7 +259,6 @@ pub fn vtl2_memory_layout_request( /// Build a device tree representing the whole guest partition. fn build_device_tree( processor_topology: &ProcessorTopology, - mem_layout: &MemoryLayout, all_ram: &[MemoryRangeWithNode], vtl2_protectable_ram: &[MemoryRange], vtl2_base_address: Vtl2BaseAddressType, @@ -267,6 +266,9 @@ fn build_device_tree( with_vmbus_redirect: bool, com_serial: Option, entropy: Option<&[u8]>, + chipset_low_mmio: Option, + chipset_high_mmio: Option, + vtl2_chipset_mmio: Option, ) -> Result, fdt::builder::Error> { let mut buf = vec![0; HV_PAGE_SIZE as usize * 256]; @@ -339,26 +341,20 @@ fn build_device_tree( .add_u32(p_size_cells, 2)? .add_prop_array(p_ranges, &[])?; - // Determine how much mmio this system has. 2 or less gaps are reported to - // VTL0. The 3rd and/or 4th gap will be reported to VTL2. 
Any more are - // ignored. - let mut mmio_chunks = mem_layout.mmio().chunks(2); - - let extract_ranges = |mmio: Option<&[MemoryRange]>| -> Vec { - let mut ranges = Vec::new(); - - if let Some(mmio) = mmio { - for entry in mmio { - ranges.push(entry.start()); - ranges.push(entry.start()); - ranges.push(entry.len()); - } - } - ranges - }; - - let ranges_vtl0 = extract_ranges(mmio_chunks.next()); - let ranges_vtl2 = extract_ranges(mmio_chunks.next()); + // Build DT ranges for VMBus devices. VTL0 gets the chipset low/high MMIO + // ranges; VTL2 gets its own private chipset MMIO range. + let mut ranges_vtl0: Vec = Vec::new(); + for range in [chipset_low_mmio, chipset_high_mmio].into_iter().flatten() { + ranges_vtl0.push(range.start()); + ranges_vtl0.push(range.start()); + ranges_vtl0.push(range.len()); + } + let mut ranges_vtl2: Vec = Vec::new(); + if let Some(range) = vtl2_chipset_mmio { + ranges_vtl2.push(range.start()); + ranges_vtl2.push(range.start()); + ranges_vtl2.push(range.len()); + } // VTL0 vmbus root device let vmbus_vtl0_name = if ranges_vtl0.is_empty() { @@ -511,6 +507,12 @@ pub struct LoadIgvmParams<'a, T: ArchTopology> { pub com_serial: Option, /// Entropy pub entropy: Option<&'a [u8]>, + /// VTL0 chipset low MMIO range for the device tree VMBus node. + pub chipset_low_mmio: Option, + /// VTL0 chipset high MMIO range for the device tree VMBus node. + pub chipset_high_mmio: Option, + /// VTL2-private chipset MMIO range for the device tree VTL2 VMBus node. + pub vtl2_chipset_mmio: Option, } pub fn load_igvm( @@ -553,6 +555,9 @@ fn load_igvm_x86( with_vmbus_redirect, com_serial, entropy, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, } = params; let relocations_enabled = match vtl2_base_address { @@ -930,14 +935,11 @@ fn load_igvm_x86( } } IgvmDirectiveHeader::MmioRanges(ref info) => { - // Convert the OpenVMM format to the IGVM format - // Any gaps above 2 are ignored. 
- let mmio = mem_layout.mmio(); - if mmio.len() < 2 { - return Err(Error::UnsupportedMmio); - } + // Convert the chipset MMIO ranges to the IGVM format. + let low = chipset_low_mmio.ok_or(Error::UnsupportedMmio)?; + let high = chipset_high_mmio.ok_or(Error::UnsupportedMmio)?; let mmio_ranges = IGVM_VHS_MMIO_RANGES { - mmio_ranges: [from_memory_range(&mmio[0]), from_memory_range(&mmio[1])], + mmio_ranges: [from_memory_range(&low), from_memory_range(&high)], }; import_parameter(&mut parameter_areas, info, mmio_ranges.as_bytes())?; } @@ -951,7 +953,6 @@ fn load_igvm_x86( IgvmDirectiveHeader::DeviceTree(ref info) => { let dt = build_device_tree( processor_topology, - mem_layout, &all_ram, &vtl2_protectable_ram, vtl2_base_address, @@ -959,6 +960,9 @@ fn load_igvm_x86( with_vmbus_redirect, com_serial, entropy, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, ) .map_err(Error::DeviceTree)?; import_parameter(&mut parameter_areas, info, &dt)?; diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs index 4b4aa0de6b..4d272561c1 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs @@ -10,7 +10,6 @@ use loader::linux::InitrdAddressType; use loader::linux::InitrdConfig; use loader::linux::RegisterConfig; use loader::linux::ZeroPageConfig; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64; use std::ffi::CString; use std::io::Seek; use thiserror::Error; @@ -494,9 +493,13 @@ fn build_dt( } } - assert!(DEFAULT_MMIO_GAPS_AARCH64.len() == 2); - let low_mmio_gap = DEFAULT_MMIO_GAPS_AARCH64[0]; - let high_mmio_gap = DEFAULT_MMIO_GAPS_AARCH64[1]; + // Build VMBus MMIO ranges from the memory layout's chipset MMIO gaps. 
+ assert!( + cfg.mem_layout.mmio().len() >= 2, + "need at least two MMIO regions for VMBus DT node" + ); + let low_mmio_gap = cfg.mem_layout.mmio()[0]; + let high_mmio_gap = cfg.mem_layout.mmio()[1]; soc = soc .start_node("vmbus")? .add_u32(p_address_cells, 2)? diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 810f62d18b..9dac349ac0 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -55,6 +55,12 @@ pub struct Config { pub chipset_devices: Vec, pub pci_chipset_devices: Vec, pub chipset_capabilities: VmChipsetCapabilities, + /// Chipset low MMIO range (below 4 GiB) for VMOD/PCI0 _CRS. + pub chipset_low_mmio: Option, + /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. + pub chipset_high_mmio: Option, + /// VTL2-private chipset MMIO range for VTL2 VMBus. + pub vtl2_chipset_mmio: Option, pub generation_id_recv: Option>, // This is used for testing. TODO: resourcify, and also store this in VMGS. pub rtc_delta_milliseconds: i64, @@ -63,36 +69,6 @@ pub struct Config { pub efi_diagnostics_log_level: EfiDiagnosticsLogLevelType, } -// ARM64 needs a larger low gap. -const DEFAULT_LOW_MMAP_GAP_SIZE_X86: u64 = 1024 * 1024 * 128; -const DEFAULT_LOW_MMAP_GAP_SIZE_AARCH64: u64 = 1024 * 1024 * 512; - -/// Default mmio gaps for an x86 partition. -pub const DEFAULT_MMIO_GAPS_X86: [MemoryRange; 2] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_X86..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x10_0000_0000), // 512MB just below 64GB, then up to 64GB -]; - -/// Default mmio gaps for x86 if VTL2 is enabled. 
-pub const DEFAULT_MMIO_GAPS_X86_WITH_VTL2: [MemoryRange; 3] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_X86..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x20_0000_0000), // 512MB just below 64GB, then up to 128GB - MemoryRange::new(0x20_0000_0000..0x20_4000_0000), // 128GB to 129 GB -]; - -/// Default mmio gaps for an aarch64 partition. -pub const DEFAULT_MMIO_GAPS_AARCH64: [MemoryRange; 2] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_AARCH64..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x10_0000_0000), // 512MB just below 64GB, then up to 64GB -]; - -/// Default mmio gaps for aarch64 if VTL2 is enabled. -pub const DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2: [MemoryRange; 3] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_AARCH64..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x20_0000_0000), // 512MB just below 64GB, then up to 128GB - MemoryRange::new(0x20_0000_0000..0x20_4000_0000), // 128GB to 129 GB -]; - pub const DEFAULT_GIC_DISTRIBUTOR_BASE: u64 = 0xFFFF_0000; // The KVM in-kernel vGICv3 requires the distributor and redistributor bases be 64KiB aligned. pub const DEFAULT_GIC_REDISTRIBUTORS_BASE: u64 = if cfg!(target_os = "linux") { @@ -216,11 +192,7 @@ pub enum Vtl2BaseAddressType { Vtl2Allocate { size: Option }, } -#[derive(Debug, MeshPayload)] -pub enum MmioRangeConfig { - Dynamic { size: u64 }, - Fixed(MemoryRange), -} +pub use vmm_core_defs::MmioRangeConfig; #[derive(Debug, MeshPayload)] pub struct PcieRootComplexConfig { @@ -366,7 +338,6 @@ pub struct MemoryConfig { pub transparent_hugepages: bool, pub hugepages: bool, pub hugepage_size: Option, - pub mmio_gaps: Vec, /// Test only: per-NUMA-node memory sizes. When set, RAM is distributed /// across vNUMA nodes according to these sizes instead of assigning all RAM /// to node 0. The sum must equal `mem_size`. 
diff --git a/openvmm/openvmm_entry/Cargo.toml b/openvmm/openvmm_entry/Cargo.toml index 8e3164cd7b..82f8103bef 100644 --- a/openvmm/openvmm_entry/Cargo.toml +++ b/openvmm/openvmm_entry/Cargo.toml @@ -40,7 +40,6 @@ get_resources.workspace = true hyperv_ic_resources.workspace = true ide_resources.workspace = true input_core.workspace = true -memory_range.workspace = true net_backend_resources.workspace = true netvsp_resources.workspace = true nvme_resources.workspace = true diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 548270aefc..2f51954392 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -58,7 +58,6 @@ use guid::Guid; use input_core::MultiplexedInputHandle; use inspect::InspectMut; use io::Read; -use memory_range::MemoryRange; use mesh::CancelContext; use mesh::CellUpdater; use mesh::rpc::RpcSend; @@ -66,10 +65,6 @@ use meshworker::VmmMesh; use net_backend_resources::mac_address::MacAddress; use nvme_resources::NvmeControllerRequest; use openvmm_defs::config::Config; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2; use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::EfiDiagnosticsLogLevelType; @@ -87,7 +82,6 @@ use openvmm_defs::config::SerialInformation; use openvmm_defs::config::VirtioBus; use openvmm_defs::config::VmbusConfig; use openvmm_defs::config::VpciDeviceConfig; -use openvmm_defs::config::Vtl2BaseAddressType; use openvmm_defs::config::Vtl2Config; use openvmm_defs::rpc::VmRpc; use openvmm_defs::worker::VM_WORKER; @@ -720,27 +714,12 @@ async fn vm_config_from_command_line( }), ); - // If VTL2 is enabled, and we are not in VTL2 self allocate mode, provide an - // mmio gap for VTL2. 
- let use_vtl2_gap = opt.vtl2 - && !matches!( - opt.igvm_vtl2_relocation_type, - Vtl2BaseAddressType::Vtl2Allocate { .. }, - ); + let mut pcie_root_complexes = Vec::new(); #[cfg(guest_arch = "aarch64")] let arch = MachineArch::Aarch64; #[cfg(guest_arch = "x86_64")] let arch = MachineArch::X86_64; - - let mmio_gaps: Vec = match (use_vtl2_gap, arch) { - (true, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(), - (true, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2.into(), - (false, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86.into(), - (false, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64.into(), - }; - - let mut pcie_root_complexes = Vec::new(); for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() { let ports = opt .pcie_root_port @@ -899,6 +878,9 @@ async fn vm_config_from_command_line( mut chipset_devices, pci_chipset_devices, capabilities, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, } = chipset .build() .context("failed to build chipset configuration")?; @@ -1597,7 +1579,6 @@ async fn vm_config_from_command_line( } else { opt.memory_size() }, - mmio_gaps, prefetch_memory: opt.prefetch_memory(), private_memory: opt.private_memory(), transparent_hugepages: opt.transparent_hugepages(), @@ -1654,6 +1635,9 @@ async fn vm_config_from_command_line( chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, #[cfg(windows)] vpci_resources, vmgs, diff --git a/openvmm/openvmm_entry/src/ttrpc/mod.rs b/openvmm/openvmm_entry/src/ttrpc/mod.rs index dcdffb82e1..22bde5b29e 100644 --- a/openvmm/openvmm_entry/src/ttrpc/mod.rs +++ b/openvmm/openvmm_entry/src/ttrpc/mod.rs @@ -33,7 +33,6 @@ use mesh_worker::WorkerId; use mesh_worker::WorkerRpc; use netvsp_resources::NetvspHandle; use openvmm_defs::config::Config; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::HypervisorConfig; use 
openvmm_defs::config::LoadMode; @@ -573,7 +572,6 @@ impl VmService { vpci_devices: vec![], memory: MemoryConfig { mem_size: config_mem_size, - mmio_gaps: DEFAULT_MMIO_GAPS_X86.into(), prefetch_memory: false, private_memory: false, transparent_hugepages: false, @@ -612,6 +610,9 @@ impl VmService { chipset_devices: chipset.chipset_devices, pci_chipset_devices: chipset.pci_chipset_devices, chipset_capabilities: chipset.capabilities, + chipset_low_mmio: chipset.chipset_low_mmio, + chipset_high_mmio: chipset.chipset_high_mmio, + vtl2_chipset_mmio: chipset.vtl2_chipset_mmio, generation_id_recv: None, rtc_delta_milliseconds: 0, automatic_guest_reset: true, diff --git a/petri/Cargo.toml b/petri/Cargo.toml index 0ca720467f..14b599a273 100644 --- a/petri/Cargo.toml +++ b/petri/Cargo.toml @@ -31,7 +31,6 @@ disk_backend_resources.workspace = true framebuffer.workspace = true get_resources.workspace = true ide_resources.workspace = true -memory_range.workspace = true net_backend_resources.workspace = true netvsp_resources.workspace = true nvme_resources.workspace = true diff --git a/petri/src/vm/mod.rs b/petri/src/vm/mod.rs index c67ee590a1..42bfff6092 100644 --- a/petri/src/vm/mod.rs +++ b/petri/src/vm/mod.rs @@ -22,7 +22,6 @@ use crate::vtl2_settings::Vtl2StorageControllerBuilder; use async_trait::async_trait; use get_resources::ged::FirmwareEvent; use guid::Guid; -use memory_range::MemoryRange; use mesh::CancelContext; use openvmm_defs::config::Vtl2BaseAddressType; use pal_async::DefaultDriver; @@ -2130,16 +2129,6 @@ pub enum ApicMode { X2apicEnabled, } -/// Mmio configuration. -#[derive(Debug)] -pub enum MmioConfig { - /// The platform provided default. - Platform, - /// Custom mmio gaps. - /// TODO: Not supported on all platforms (ie Hyper-V). - Custom(Vec), -} - /// Common memory configuration information for the VM. #[derive(Debug)] pub struct MemoryConfig { @@ -2150,8 +2139,6 @@ pub struct MemoryConfig { /// /// Dynamic memory will be disabled if this is `None`. 
pub dynamic_memory_range: Option<(u64, u64)>, - /// Specifies the mmio gaps to use, either platform or custom. - pub mmio_gaps: MmioConfig, /// Per-NUMA-node memory sizes. When set, RAM is distributed across /// vNUMA nodes instead of assigning all RAM to node 0. pub numa_mem_sizes: Option>, @@ -2162,7 +2149,6 @@ impl Default for MemoryConfig { Self { startup_bytes: 4 * 1024 * 1024 * 1024, // 4 GiB dynamic_memory_range: None, - mmio_gaps: MmioConfig::Platform, numa_mem_sizes: None, } } diff --git a/petri/src/vm/openvmm/construct.rs b/petri/src/vm/openvmm/construct.rs index 1e39f7b850..b1b9b67b3c 100644 --- a/petri/src/vm/openvmm/construct.rs +++ b/petri/src/vm/openvmm/construct.rs @@ -23,7 +23,6 @@ use crate::UefiConfig; use crate::VmbusStorageType; use crate::linux_direct_serial_agent::LinuxDirectSerialAgent; -use crate::MmioConfig; use crate::SIZE_1_MB; use crate::VmbusStorageController; use crate::openvmm::memdiff_vmgs; @@ -46,10 +45,6 @@ use mesh_process::Mesh; use nvme_resources::NamespaceDefinition; use nvme_resources::NvmeControllerHandle; use openvmm_defs::config::Config; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2; use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::HypervisorConfig; @@ -361,7 +356,6 @@ impl PetriVmConfigOpenVmm { let MemoryConfig { startup_bytes, dynamic_memory_range, - mmio_gaps, numa_mem_sizes, } = memory; @@ -380,22 +374,6 @@ impl PetriVmConfigOpenVmm { openvmm_defs::config::MemoryConfig { mem_size, - mmio_gaps: match mmio_gaps { - MmioConfig::Platform => { - if firmware.is_openhcl() { - match arch { - MachineArch::X86_64 => DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(), - MachineArch::Aarch64 => DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2.into(), - } - } else { - match arch { - MachineArch::X86_64 => 
DEFAULT_MMIO_GAPS_X86.into(), - MachineArch::Aarch64 => DEFAULT_MMIO_GAPS_AARCH64.into(), - } - } - } - MmioConfig::Custom(ranges) => ranges, - }, prefetch_memory: false, private_memory: false, transparent_hugepages: false, @@ -481,6 +459,9 @@ impl PetriVmConfigOpenVmm { mut chipset_devices, pci_chipset_devices, capabilities, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, } = chipset; // Add the TPM @@ -518,6 +499,9 @@ impl PetriVmConfigOpenVmm { chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, // Basic virtualization device support hypervisor: HypervisorConfig { diff --git a/vmm_core/vm_manifest_builder/Cargo.toml b/vmm_core/vm_manifest_builder/Cargo.toml index b89fa5d683..a040d3a9b0 100644 --- a/vmm_core/vm_manifest_builder/Cargo.toml +++ b/vmm_core/vm_manifest_builder/Cargo.toml @@ -15,6 +15,7 @@ serial_core.workspace = true serial_debugcon_resources.workspace = true serial_pl011_resources.workspace = true vm_resource.workspace = true +vmm_core_defs.workspace = true vmotherboard.workspace = true mesh.workspace = true diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index b49f96066c..0d69c5093d 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -43,6 +43,7 @@ use vm_resource::PlatformResource; use vm_resource::Resource; use vm_resource::ResourceId; use vm_resource::kind::SerialBackendHandle; +use vmm_core_defs::MmioRangeConfig; use vmotherboard::ChipsetDeviceHandle; use vmotherboard::LegacyPciChipsetDeviceHandle; use vmotherboard::options::BaseChipsetManifest; @@ -102,6 +103,15 @@ pub struct VmChipsetResult { pub pci_chipset_devices: Vec, /// Derived chipset capabilities needed by firmware and table generation. pub capabilities: VmChipsetCapabilities, + /// Default chipset low MMIO range (below 4 GiB) for VMOD/PCI0 _CRS. 
+ /// `None` when the VM type has no VMBus or PCI bus. + pub chipset_low_mmio: Option, + /// Default chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. + /// `None` when the VM type has no VMBus or PCI bus. + pub chipset_high_mmio: Option, + /// Default VTL2-private chipset MMIO range for VTL2 VMBus. + /// `None` when the VM type does not include VTL2. + pub vtl2_chipset_mmio: Option, } /// Error type for building a VM manifest. @@ -236,6 +246,9 @@ impl VmManifestBuilder { with_psp: false, with_guest_watchdog: false, }, + chipset_low_mmio: None, + chipset_high_mmio: None, + vtl2_chipset_mmio: None, }; if let Some((backend, port)) = self.debugcon { @@ -390,6 +403,32 @@ impl VmManifestBuilder { } } + // Chipset MMIO sizing: all VM types with VMBus or a PCI bus get low + + // high chipset MMIO. HclHost additionally gets VTL2 chipset MMIO. + // + // x86_64: 96 MiB low (128 MiB traditional gap minus 32 MiB reserved zone). + // aarch64: 240 MiB low (512 MiB traditional gap minus 272 MiB reserved zone). 
+ let default_low = match self.arch { + MachineArch::X86_64 => 96 * 1024 * 1024, + MachineArch::Aarch64 => 240 * 1024 * 1024, + }; + let default_high: u64 = 512 * 1024 * 1024; + let default_vtl2: u64 = 1024 * 1024 * 1024; + match self.ty { + BaseChipsetType::HypervGen1 + | BaseChipsetType::HypervGen2Uefi + | BaseChipsetType::HyperVGen2LinuxDirect + | BaseChipsetType::UnenlightenedLinuxDirect => { + result.chipset_low_mmio = Some(MmioRangeConfig::Dynamic { size: default_low }); + result.chipset_high_mmio = Some(MmioRangeConfig::Dynamic { size: default_high }); + } + BaseChipsetType::HclHost => { + result.chipset_low_mmio = Some(MmioRangeConfig::Dynamic { size: default_low }); + result.chipset_high_mmio = Some(MmioRangeConfig::Dynamic { size: default_high }); + result.vtl2_chipset_mmio = Some(MmioRangeConfig::Dynamic { size: default_vtl2 }); + } + } + Ok(result) } } diff --git a/vmm_core/vmm_core_defs/Cargo.toml b/vmm_core/vmm_core_defs/Cargo.toml index c2c93559d8..36d22e00a5 100644 --- a/vmm_core/vmm_core_defs/Cargo.toml +++ b/vmm_core/vmm_core_defs/Cargo.toml @@ -7,6 +7,7 @@ edition.workspace = true rust-version.workspace = true [dependencies] +memory_range = { workspace = true, features = ["mesh"] } virt.workspace = true inspect.workspace = true diff --git a/vmm_core/vmm_core_defs/src/lib.rs b/vmm_core/vmm_core_defs/src/lib.rs index 221cbe554c..35fcdd2f3f 100644 --- a/vmm_core/vmm_core_defs/src/lib.rs +++ b/vmm_core/vmm_core_defs/src/lib.rs @@ -9,9 +9,24 @@ pub mod debug_rpc; use inspect::Inspect; +use memory_range::MemoryRange; +use mesh::MeshPayload; use mesh::payload::Protobuf; use std::sync::Arc; +/// Specifies an MMIO range, either by size (the resolver allocates) or by +/// fixed location. +#[derive(Debug, MeshPayload)] +pub enum MmioRangeConfig { + /// Dynamically allocate a range of the given size. + Dynamic { + /// Size of the range in bytes. + size: u64, + }, + /// Use the specified fixed memory range. 
+ Fixed(MemoryRange), +} + /// HaltReason sent by devices and vp_set to the vmm. #[derive(Debug, Clone, Eq, PartialEq, Protobuf, Inspect)] #[inspect(tag = "halt_reason")] diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs index 9e4f4127d1..5db40cded7 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs @@ -404,7 +404,6 @@ async fn idle_test( MemoryConfig { startup_bytes: 16 * (1024 * 1024 * 1024), dynamic_memory_range: None, - mmio_gaps: petri::MmioConfig::Platform, numa_mem_sizes: None, } }) diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs index 921906e17e..7a17c69534 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs +++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs @@ -372,16 +372,10 @@ async fn parse_openhcl_memory_node( async fn openhcl_linux_vtl2_mmio_self_allocate( config: PetriVmBuilder, ) -> Result<(), anyhow::Error> { - // Use the OpenVMM default which has a 1GB mmio gap for VTL2. This should - // cause the whole gap to be given to VTL2, as we should report 128MB for - // self allocation. - let expected_mmio_ranges: Vec = - openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(); + // The worker resolver allocates a 1 GiB VTL2 chipset MMIO region and + // reports 128 MiB for self allocation. Verify the device tree reflects + // this. 
let (mut vm, agent) = config - .with_memory(MemoryConfig { - mmio_gaps: petri::MmioConfig::Custom(expected_mmio_ranges.clone()), - ..Default::default() - }) .with_vtl2_base_address_type(Vtl2BaseAddressType::Vtl2Allocate { size: None }) .run() .await?; @@ -404,16 +398,12 @@ async fn openhcl_linux_vtl2_mmio_self_allocate( const EXPECTED_MMIO_SIZE: u64 = 128 * 1024 * 1024; assert_eq!(mmio_size, EXPECTED_MMIO_SIZE); - // Read the bootloader provided dt via sysfs to verify the VTL0 and VTL2 - // mmio ranges are as expected. + // Verify the VTL2 VMBus gets a non-empty MMIO range in the device tree. let vtl2_mmio = parse_vmbus_mmio(&vtl2_agent, "bus/vmbus").await?; - assert_eq!(vtl2_mmio, expected_mmio_ranges[2..]); - let mut vtl0_mmio = Vec::new(); - for range_start in expected_mmio_ranges[..2].iter().map(|r| r.start()) { - let range = parse_openhcl_memory_node(&vtl2_agent, range_start).await?; - vtl0_mmio.push(range); - } - assert_eq!(vtl0_mmio, expected_mmio_ranges[..2]); + assert!( + !vtl2_mmio.is_empty(), + "VTL2 should have at least one MMIO range" + ); agent.power_off().await?; vm.wait_for_clean_teardown().await?; From 480414402f5cdbbfe38c1bb6c62c2a620a13918b Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 16:43:30 +0000 Subject: [PATCH 13/36] feedback --- .../openvmm_core/src/worker/memory_layout.rs | 15 +++-- vm/vmcore/vm_topology/src/layout.rs | 67 +++++++++---------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index d587a28f0e..5258aeacfd 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -22,6 +22,7 @@ use anyhow::bail; use memory_range::MemoryRange; use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; +use std::sync::Arc; use vm_topology::layout::LayoutBuilder; use vm_topology::layout::Placement; use 
vm_topology::memory::MemoryLayout; @@ -125,14 +126,14 @@ pub(super) fn resolve_memory_layout( } else { ARCH_RESERVED_AARCH64 }; - builder.reserve("arch_reserved".to_string(), arch_reserved); + builder.reserve("arch_reserved", arch_reserved); // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus relay // devices and PIIX4 PCI BARs. if let Some(config) = input.chipset_low_mmio { add_mmio_range( &mut builder, - "chipset_low_mmio".to_string(), + "chipset_low_mmio", &mut chipset_low_mmio, config, TWO_MB, @@ -144,7 +145,7 @@ pub(super) fn resolve_memory_layout( if let Some(config) = input.chipset_high_mmio { add_mmio_range( &mut builder, - "chipset_high_mmio".to_string(), + "chipset_high_mmio", &mut chipset_high_mmio, config, TWO_MB, @@ -198,7 +199,7 @@ pub(super) fn resolve_memory_layout( // request. if input.virtio_mmio_count > 0 { builder.request( - "virtio_mmio".to_string(), + "virtio_mmio", &mut virtio_mmio_region, input.virtio_mmio_count as u64 * PAGE_SIZE, PAGE_SIZE, @@ -226,7 +227,7 @@ pub(super) fn resolve_memory_layout( if let Some(config) = input.vtl2_chipset_mmio { add_mmio_range( &mut builder, - "vtl2_chipset_mmio".to_string(), + "vtl2_chipset_mmio", &mut vtl2_chipset_mmio, config, TWO_MB, @@ -358,7 +359,7 @@ fn pcie_ecam_size(root_complex: &PcieRootComplexConfig) -> anyhow::Result { fn add_pcie_ecam_range<'a>( builder: &mut LayoutBuilder<'a>, - tag: String, + tag: impl Into>, target: &'a mut MemoryRange, config: &Option, size: u64, @@ -375,7 +376,7 @@ fn add_pcie_ecam_range<'a>( fn add_mmio_range<'a>( builder: &mut LayoutBuilder<'a>, - tag: String, + tag: impl Into>, target: &'a mut MemoryRange, config: &MmioRangeConfig, alignment: u64, diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index 1c0a320daa..23fcd30b0b 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -39,6 +39,7 @@ //! 
``` use memory_range::MemoryRange; +use std::sync::Arc; use thiserror::Error; const PAGE_SIZE: u64 = 4096; @@ -94,7 +95,7 @@ pub enum AllocationPhase { #[derive(Debug, Clone, PartialEq, Eq)] pub struct PlacedRange { /// The caller-supplied tag for the request. - pub tag: String, + pub tag: Arc, /// The kind of allocation. pub kind: PlacedRangeKind, /// The placed range. @@ -112,37 +113,37 @@ pub struct LayoutBuilder<'a> { } struct ReservedRequest { - tag: String, + tag: Arc, range: MemoryRange, } struct FixedRequest { - tag: String, + tag: Arc, range: MemoryRange, } struct DynamicRequest<'a> { - tag: String, + tag: Arc, target: &'a mut MemoryRange, size: u64, alignment: u64, } struct RamRequest<'a> { - tag: String, + tag: Arc, target: &'a mut Vec, size: u64, alignment: u64, } trait RequestDetails { - fn tag(&self) -> &str; + fn tag(&self) -> &Arc; fn size(&self) -> u64; fn alignment(&self) -> u64; } impl RequestDetails for DynamicRequest<'_> { - fn tag(&self) -> &str { + fn tag(&self) -> &Arc { &self.tag } @@ -156,7 +157,7 @@ impl RequestDetails for DynamicRequest<'_> { } impl RequestDetails for RamRequest<'_> { - fn tag(&self) -> &str { + fn tag(&self) -> &Arc { &self.tag } @@ -343,9 +344,9 @@ impl AllocationState { .unwrap_or(0) } - fn record(&mut self, tag: &str, kind: PlacedRangeKind, range: MemoryRange) { + fn record(&mut self, tag: &Arc, kind: PlacedRangeKind, range: MemoryRange) { self.allocations.push(PlacedRange { - tag: tag.to_string(), + tag: tag.clone(), kind, range, }); @@ -355,7 +356,7 @@ impl AllocationState { } } - fn allocate_range(&mut self, tag: &str, kind: PlacedRangeKind, range: MemoryRange) { + fn allocate_range(&mut self, tag: &Arc, kind: PlacedRangeKind, range: MemoryRange) { self.remove_free_range(range); self.record(tag, kind, range); } @@ -393,7 +394,7 @@ pub enum AllocateError { #[error("{tag}: invalid size {size:#x} (must be > 0 and a multiple of {PAGE_SIZE:#x})")] InvalidSize { /// The tag identifying the request. 
- tag: String, + tag: Arc, /// The invalid size. size: u64, }, @@ -401,7 +402,7 @@ pub enum AllocateError { #[error("{tag}: invalid alignment {alignment:#x} (must be >= {PAGE_SIZE:#x} and a power of 2)")] InvalidAlignment { /// The tag identifying the request. - tag: String, + tag: Arc, /// The invalid alignment. alignment: u64, }, @@ -409,11 +410,11 @@ pub enum AllocateError { #[error("fixed/reserved requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] FixedOverlap { /// The tag of the first request. - tag_a: String, + tag_a: Arc, /// The range of the first request. range_a: MemoryRange, /// The tag of the second request. - tag_b: String, + tag_b: Arc, /// The range of the second request. range_b: MemoryRange, }, @@ -423,7 +424,7 @@ pub enum AllocateError { )] Exhausted { /// The tag identifying the request. - tag: String, + tag: Arc, /// The requested size. size: u64, /// The requested alignment. @@ -453,7 +454,7 @@ impl<'a> LayoutBuilder<'a> { /// Reserved ranges are removed from the free list and may appear in the /// returned [`PlacedRange`] list, but they do not affect post-MMIO /// placement. Trailing reserved ranges are omitted from the returned list. - pub fn reserve(&mut self, tag: impl Into, range: MemoryRange) { + pub fn reserve(&mut self, tag: impl Into>, range: MemoryRange) { self.reserved.push(ReservedRequest { tag: tag.into(), range, @@ -462,7 +463,7 @@ impl<'a> LayoutBuilder<'a> { /// Adds a fixed range request to the builder. /// - pub fn fixed(&mut self, tag: impl Into, range: MemoryRange) { + pub fn fixed(&mut self, tag: impl Into>, range: MemoryRange) { self.fixed.push(FixedRequest { tag: tag.into(), range, @@ -474,7 +475,7 @@ impl<'a> LayoutBuilder<'a> { /// The target is filled in when [`Self::allocate`] succeeds. pub fn request( &mut self, - tag: impl Into, + tag: impl Into>, target: &'a mut MemoryRange, size: u64, alignment: u64, @@ -512,7 +513,7 @@ impl<'a> LayoutBuilder<'a> { /// succeeds. 
pub fn ram( &mut self, - tag: impl Into, + tag: impl Into>, target: &'a mut Vec, size: u64, alignment: u64, @@ -562,17 +563,17 @@ impl Default for LayoutBuilder<'_> { } } -fn validate_size_alignment(tag: &str, size: u64, alignment: u64) -> Result<(), AllocateError> { +fn validate_size_alignment(tag: &Arc, size: u64, alignment: u64) -> Result<(), AllocateError> { if size == 0 || !size.is_multiple_of(PAGE_SIZE) { return Err(AllocateError::InvalidSize { - tag: tag.to_string(), + tag: tag.clone(), size, }); } if alignment < PAGE_SIZE || !alignment.is_power_of_two() { return Err(AllocateError::InvalidAlignment { - tag: tag.to_string(), + tag: tag.clone(), alignment, }); } @@ -602,24 +603,22 @@ fn validate_pinned_ranges( ) -> Result<(), AllocateError> { let mut pinned = reserved_requests .iter() - .map(|request| (request.range, request.tag.as_str())) + .map(|request| (request.range, &request.tag)) .chain( fixed_requests .iter() - .map(|request| (request.range, request.tag.as_str())), + .map(|request| (request.range, &request.tag)), ) .collect::>(); pinned.sort_by_key(|(range, _)| range.start()); - for pair in pinned.windows(2) { - let (range_a, tag_a) = pair[0]; - let (range_b, tag_b) = pair[1]; + for &[(range_a, tag_a), (range_b, tag_b)] in pinned.array_windows() { if range_a.overlaps(&range_b) { return Err(AllocateError::FixedOverlap { - tag_a: tag_a.to_string(), + tag_a: tag_a.clone(), range_a, - tag_b: tag_b.to_string(), + tag_b: tag_b.clone(), range_b, }); } @@ -652,7 +651,7 @@ fn exhausted_error( region_end: u64, ) -> AllocateError { AllocateError::Exhausted { - tag: request.tag().to_string(), + tag: request.tag().clone(), size: request.size(), alignment: request.alignment(), phase, @@ -1122,11 +1121,11 @@ mod tests { let sorted = builder.allocate().unwrap(); - assert_eq!(sorted[0].tag, "ram"); + assert_eq!(&*sorted[0].tag, "ram"); assert_eq!(sorted[0].kind, PlacedRangeKind::Ram); - assert_eq!(sorted[1].tag, "mmio64"); + assert_eq!(&*sorted[1].tag, "mmio64"); 
assert_eq!(sorted[1].kind, PlacedRangeKind::Mmio64); - assert_eq!(sorted[2].tag, "mmio32"); + assert_eq!(&*sorted[2].tag, "mmio32"); assert_eq!(sorted[2].kind, PlacedRangeKind::Mmio32); } From b5753682468c45d923acaa82ee19cc03fd03ace9 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 16:52:08 +0000 Subject: [PATCH 14/36] better guide --- .../architecture/openvmm/memory-layout.md | 333 ++++++++++-------- 1 file changed, 178 insertions(+), 155 deletions(-) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index baf4503405..2668ca3b6e 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -1,145 +1,167 @@ # Memory Layout -OpenVMM computes guest physical address layouts by combining fixed platform -ranges, RAM requests, MMIO requests, and private implementation ranges through a -single deterministic allocator. - -The memory layout is part of the VM compatibility contract. Guest operating -systems remember RAM and device addresses across hibernation, and saved VM state -contains device state tied to those addresses. For an existing VM, changing -request order, placement class, or alignment policy can move guest physical -addresses and break resume. +OpenVMM has to decide where every byte of guest physical address space goes: +RAM, MMIO windows for emulated and PCIe devices, paravisor private memory, and +architectural ranges like the LAPIC or GIC. This page describes how those +decisions are made. ```admonish warning title="Compatibility surface" -Treat layout policy changes like VM ABI changes. A new default can be fine for -new VMs, but existing persisted VM configuration must continue to resolve to the -same guest physical addresses. +Guest physical addresses are part of the VM's compatibility contract. 
Guests +remember device and RAM locations across hibernation, and saved VM state +references them. Changing request order, placement class, or alignment can +move guest addresses and break resume on existing VMs. + +Treat layout policy changes like VM ABI changes: a new default may be fine +for new VMs, but existing persisted configuration must continue to resolve +to the same guest physical addresses. ``` -## Layers - -Memory layout is split across three layers: +## Two pieces -| Layer | Responsibility | -|---|---| -| `vm_topology::layout` | Pure address-space allocation. | -| `openvmm_core::worker::memory_layout` | Production VM policy and validation. | -| `vm_topology::memory::MemoryLayout` | Shared validation and query API. | +Layout resolution is split into two pieces that you should think about +separately: -[`vm_topology::layout::LayoutBuilder`](https://openvmm.dev/rustdoc/linux/vm_topology/layout/struct.LayoutBuilder.html) -knows only about ranges, sizes, alignments, and placement classes. It does not -know about chipsets, firmware, VTLs, PCI, or host physical address width. -Callers express policy by adding fixed ranges, reserved ranges, RAM requests, -and dynamic MMIO requests. +1. A **pure address-space allocator** in `vm_topology::layout`. It knows + nothing about chipsets, firmware, VTLs, PCI, or the host. Callers describe + what they need in terms of ranges, sizes, alignments, and a placement + class, and the allocator returns deterministic guest physical addresses. +2. A **worker resolver** in `openvmm_core::worker::memory_layout`. This is + where OpenVMM's policy lives: which platform ranges are pinned, what + alignments NUMA nodes get, how PCIe ECAM is sized, and so on. The resolver + describes the VM to the allocator, runs it, and builds the resulting + [`MemoryLayout`](https://openvmm.dev/rustdoc/linux/vm_topology/memory/struct.MemoryLayout.html) + that the rest of the VM worker uses to look up RAM, MMIO, PCI ECAM, and + PCI MMIO ranges. 
-The VM worker owns the production policy. It feeds existing chipset MMIO gaps -into the allocator as fixed occupied ranges, resolves PCIe root complex ECAM -from an optional fixed range or the root-complex bus window, resolves PCIe low -MMIO and high MMIO from typed intents, then asks the allocator to place RAM and -private implementation ranges. Future work moves more MMIO consumers from -precomputed gaps into typed dynamic requests. +Keeping the allocator policy-free means its behavior can be exhaustively +tested in isolation, and the worker can be reasoned about as a list of +requests that fully describes the VM. -`MemoryLayout` remains the object other worker code uses to query RAM, MMIO, -PCI ECAM, PCI MMIO, VTL2 memory, and the VTL0-visible layout top. +## The allocator -## Request Types +[`LayoutBuilder`](https://openvmm.dev/rustdoc/linux/vm_topology/layout/struct.LayoutBuilder.html) +accepts four kinds of input: -The allocator accepts these input forms: - -| Input | Meaning | -|---|---| -| `reserve(tag, range)` | Blocks allocation, but does not raise layout top. | -| `fixed(tag, range)` | Already-known occupied range that is part of layout. | -| `ram(tag, target, size, alignment)` | Splittable ordinary RAM request. | -| `request(..., Placement::Mmio32)` | Single range below 4 GB, packed top down. | -| `request(..., Placement::Mmio64)` | Single range after RAM, packed bottom up. | -| `request(..., Placement::PostMmio)` | Single range after all VTL0 RAM and MMIO. | - -`reserve` is for architectural holes that must block allocation but should not -make the VTL0 layout appear larger. A high reserved hole near the top of the -address space, for example, should not force VTL2 or high MMIO above that hole. - -`fixed` is for ranges that have already been resolved by policy or existing -configuration. Fixed ranges block all dynamic allocation and are included in the -returned placed ranges. 
- -## Allocation Order - -The allocator is deterministic for the same request list. The phase order is: - -1. Remove reserved ranges from free space. -2. Remove fixed ranges from free space. -3. Allocate 32-bit MMIO below 4 GB, top down. -4. Allocate ordinary RAM from GPA 0 upward, splitting around holes. -5. Allocate 64-bit MMIO from the end of RAM upward. -6. Allocate post-MMIO ranges after the VTL0-visible layout. - -Within MMIO phases, requests are ordered by alignment, then size, then caller -order. RAM and post-MMIO requests use caller order because those orders carry -policy. RAM request order assigns NUMA vnode ownership. Post-MMIO request order -keeps private implementation ranges from being reordered by alignment. - -## Worker Policy - -The VM worker resolver applies the production policy in -`openvmm/openvmm_core/src/worker/memory_layout.rs`: - -1. Validate total RAM size and optional per-vNUMA budgets. -2. Add existing chipset MMIO gaps as fixed ranges. -3. Add PCIe root complex ECAM and low MMIO requests as `Placement::Mmio32`. - A root complex with no fixed ECAM range gets an ECAM size derived from its - bus window. -4. Add PCIe root complex high MMIO requests as `Placement::Mmio64`. -5. Add RAM requests in vnode order. -6. Add optional IGVM VTL2 memory as `Placement::PostMmio`. -7. Allocate all ranges. -8. Build `MemoryLayout` from resolved RAM, chipset MMIO gaps, and resolved PCIe - ranges. -9. Validate the VTL0-visible layout top against host physical address width. - -Host physical address width is deliberately not an allocator input. The layout -is computed from VM configuration first, then checked against the host. That -keeps guest physical addresses from changing just because the VM runs on a host -with a different physical address width. 
- -## RAM Alignment - -Worker RAM requests use two alignment policies: - -| RAM request size | Alignment | +| Input | Purpose | |---|---| -| Less than 1 GB | 2 MB | -| At least 1 GB | 1 GB | - -The alignment is also split granularity. If a RAM request cannot fit entirely in -the current free range, the allocator rounds the non-final chunk down to the -request alignment before continuing. That prevents a tiny fixed hole from -creating odd sub-GB RAM fragments in an otherwise GB-sized VM. - -Sub-GB RAM requests use 2 MB alignment so small NUMA nodes do not waste a full -GB of guest physical address space. - -## VTL2 Placement - -IGVM files can request VTL2 memory using `Vtl2BaseAddressType::MemoryLayout`. -The worker derives only a size and alignment from the IGVM file. It does not -feed IGVM relocation min/max bounds into layout. +| `reserve(tag, range)` | Block allocation at this address but do not include it in the layout top. | +| `fixed(tag, range)` | A range whose address is already decided. Blocks allocation and counts as part of the layout. | +| `ram(tag, target, size, alignment)` | Ordinary guest RAM. The only request type that may be split across multiple extents. | +| `request(tag, target, size, alignment, placement)` | A single contiguous range, placed dynamically. The `placement` chooses one of three phases below. | + +`reserve` and `fixed` differ only in how they affect the **layout top** — +the address one past the highest guest-visible byte. `fixed` ranges raise +it; `reserve` ranges do not. This matters because the layout top determines +where post-MMIO requests (such as paravisor private memory) start: a +reserved hole high up in the address space should not push them even +higher. + +When `allocate()` runs, it processes requests in a fixed phase order. Each +phase pulls from whatever address space the earlier phases left free: + +1. **Reserved ranges** are removed from the free space. +2. **Fixed ranges** are removed from the free space. +3. 
**`Placement::Mmio32`** requests are packed *top down* below 4 GiB, so + RAM can start at GPA 0 and grow upward through the lowest free space. +4. **RAM** requests are placed *bottom up* from GPA 0, splitting around any + holes left by the earlier phases. RAM is the only splittable kind. +5. **`Placement::Mmio64`** requests are packed *bottom up* starting at the + end of RAM. This makes the layout top a function of requested topology + rather than a precomputed high MMIO bucket size. +6. **`Placement::PostMmio`** requests are placed *after* everything else + (excluding reserved ranges from the "everything else"). They are for + ranges that should not affect the guest-visible top of memory. + +Within `Mmio32` and `Mmio64`, requests are sorted by alignment (largest +first), then size (largest first), then caller order. This keeps large, +strictly-aligned device windows from being fragmented by small devices. +RAM and `PostMmio` use caller order verbatim: RAM order is the NUMA vnode +assignment, and `PostMmio` carries policy that should not be reordered by +alignment. + +```admonish note +The allocator does not take host physical-address width as an input. The +layout is computed as a pure function of VM configuration; the worker +checks the resulting layout top against host capabilities after the fact. +This keeps guest physical addresses from shifting when the same VM moves +to a host with a different physical-address width. +``` -VTL2 memory is allocated as `Placement::PostMmio`, after all VTL0-visible RAM -and MMIO. Enabling VTL2 must not move VTL0 RAM or device ranges. The selected -VTL2 base is later validated by the IGVM loader against the file's relocation -records. Unsupported IGVM files fail there instead of reshaping the VTL0 layout. +## Worker policy + +The worker resolver in +[`openvmm_core::worker::memory_layout`](https://github.com/microsoft/openvmm/blob/main/openvmm/openvmm_core/src/worker/memory_layout.rs) +issues requests in this order: + +1. 
**Architectural reserved zone.** A `reserve` request for the + per-architecture range containing LAPIC, IOAPIC, GIC, PL011, battery, + TPM, and similar fixed-address platform devices. + + | Architecture | Range | + |---|---| + | x86_64 | `0xFE00_0000..0x1_0000_0000` | + | aarch64 | `0xEF00_0000..0x1_0000_0000` | + +2. **Chipset low MMIO** (`Mmio32`) — the VMOD/PCI0 `_CRS` low range for + VMBus relay devices and PIIX4 PCI BARs. 2 MB alignment. +3. **Chipset high MMIO** (`Mmio64`) — the corresponding high range. 2 MB + alignment. +4. **PCIe root complex ranges**, one per root complex: + - **ECAM** (`Mmio32`). If the config specifies a fixed ECAM range, the + worker uses it as `fixed`. Otherwise the size is derived from the + bus window as `(end_bus - start_bus + 1) * 1 MB` (32 devices × 8 + functions × 4 KiB per config space). + - **Low MMIO** (`Mmio32`), 2 MB aligned. + - **High MMIO** (`Mmio64`), 1 GB aligned. Per-BAR alignment would + guarantee the entire window is usable for one large BAR, but burns + address space on hosts with tight physical-address widths. +5. **Virtio-mmio slots** (`Mmio32`) — one contiguous region sized + `slot_count * 4 KiB`, when any slots are configured. +6. **RAM**, in vnode order. The first request becomes vnode 0, the second + vnode 1, and so on. Alignment depends on request size: + + | RAM request size | Alignment | + |---|---| + | < 1 GB | 2 MB | + | ≥ 1 GB | 1 GB | + + Sub-GB nodes use 2 MB so small NUMA nodes do not waste a full GB of + address space. +7. **VTL2 chipset MMIO** (`PostMmio`) — VTL2's own VMBus / chipset MMIO + region, when VTL2 is configured. Placed after VTL0 so enabling VTL2 + does not move any VTL0 address. +8. **VTL2 private memory** (`PostMmio`) — when the IGVM file requests + layout-mode VTL2 memory, the worker takes only its size and alignment + from the IGVM relocation header. 
The IGVM file's relocation min/max + bounds are not fed in as constraints here; they are validated later by + the IGVM loader against the selected base. Treating them as constraints + here would over-constrain layout and could put holes in VTL0 just to + accommodate an IGVM file we will reject anyway. + +After `allocate()` succeeds, the worker collects the resolved ranges into +the `MemoryLayout`'s MMIO, PCI ECAM, and PCI MMIO gap vectors, then checks +`MemoryLayout::end_of_layout()` against the host's physical-address width. + +## RAM splitting + +RAM is the only splittable request, and the splitter has one rule worth +calling out: the alignment passed in is also the **split granularity**. +When a single free range cannot hold the entire request, the part placed +in that range is rounded down to the request alignment before continuing. + +The practical effect is that 1 GB-aligned RAM stays in 1 GB-aligned +chunks. A small fixed hole just above the 1 GB boundary will not cause a +"nearly 1 GB" RAM extent to be placed in the interrupted range; instead, +RAM resumes at the next 1 GB boundary. ## Examples -The examples below use compact synthetic ranges. They describe the same policy -that the unit tests cover in `openvmm_core::worker::memory_layout` and -`vm_topology::layout`. +These examples use compact synthetic configurations. Each one is covered +by tests in `vm_topology::layout` or `openvmm_core::worker::memory_layout`. -### Fixed MMIO Splits RAM +### A fixed MMIO range splits RAM -A VM with 4 GB of RAM and a fixed MMIO hole from 1 GB to 2 GB gets RAM on both -sides of the hole. +4 GB of RAM with a 1 GB fixed MMIO range from 1 GB to 2 GB: | Input | Range | |---|---| @@ -152,12 +174,13 @@ sides of the hole. | MMIO | `0x4000_0000..0x8000_0000` | | RAM | `0x8000_0000..0x1_4000_0000` | -The total RAM is still 4 GB. The fixed range is occupied address space, not RAM. +Total RAM is still 4 GB — the fixed range is occupied address space, not +RAM. 
-### GB RAM Chunks Stay GB-Sized +### GB-aligned RAM stays GB-aligned -A 2 GB RAM request with a small fixed hole just above 1 GB should not create a -nearly-1-GB chunk plus a tiny fragment. +2 GB of RAM with a tiny fixed hole just above the 1 GB boundary should +not produce a sub-GB RAM fragment: | Input | Range | |---|---| @@ -170,14 +193,12 @@ nearly-1-GB chunk plus a tiny fragment. | Fixed MMIO | `0x4010_0000..0x4020_0000` | | RAM | `0x8000_0000..0xC000_0000` | -The allocator uses the first full 1 GB chunk, skips the interrupted region, and -continues at the next 1 GB boundary. +The splitter places one full 1 GB chunk, refuses to use the interrupted +sub-GB fragment, and resumes at the next 1 GB boundary. -### Small NUMA Nodes Use 2 MB Alignment +### Small NUMA nodes use 2 MB alignment -For two 512 MB NUMA nodes, using 1 GB alignment would waste address space and -make the layout harder to read. The worker uses 2 MB alignment for sub-GB RAM -requests. +Two 512 MB NUMA nodes: | Input | Size | |---|---| @@ -189,12 +210,13 @@ requests. | vnode 0 RAM | `0x0000_0000..0x2000_0000` | | vnode 1 RAM | `0x2000_0000..0x4000_0000` | -The request order is the vnode assignment order, so changing it changes the NUMA -layout. +With 1 GB alignment each node would burn a full GB of address space. +Request order is the vnode assignment, so swapping the requests swaps the +NUMA layout. -### VTL2 Does Not Move VTL0 +### VTL2 does not move VTL0 -Start with 2 GB of VTL0 RAM and a fixed MMIO hole from 1 GB to 2 GB. +Starting from 2 GB of VTL0 RAM and a fixed 1 GB MMIO hole: | VTL0 output | Range | |---|---| @@ -202,21 +224,20 @@ Start with 2 GB of VTL0 RAM and a fixed MMIO hole from 1 GB to 2 GB. | MMIO | `0x4000_0000..0x8000_0000` | | RAM | `0x8000_0000..0xC000_0000` | -If the IGVM file asks for 2 MB of VTL2 memory, the VTL0 layout stays exactly the -same. VTL2 is placed separately after the VTL0-visible top. 
+Adding a 2 MB VTL2 private-memory request leaves the VTL0 layout +identical and places VTL2 after the VTL0-visible top: | Private output | Range | |---|---| | VTL2 | `0xC000_0000..0xC020_0000` | -`MemoryLayout::end_of_layout()` reports the VTL0-visible top. VTL2 remains -available through `MemoryLayout::vtl2_range()`. +`MemoryLayout::end_of_layout()` reports the VTL0-visible top. +`MemoryLayout::vtl2_range()` reports the VTL2 range separately. -### Reserved High Holes Do Not Raise Layout Top +### Reserved holes do not raise the layout top -A reserved range blocks allocation, but it does not describe a guest-visible -resource. If a VM has 2 GB of RAM and a high reserved hole, post-MMIO memory can -still start immediately after the VTL0 layout. +A reserved range blocks allocation but is not a guest-visible resource, +so it does not push later post-MMIO ranges higher: | Input | Range | |---|---| @@ -229,17 +250,19 @@ still start immediately after the VTL0 layout. | RAM | `0x0000_0000..0x8000_0000` | | Post-MMIO | `0x8000_0000..0x8010_0000` | -The reserved hole is not returned at the end of the sorted layout because it is -only a constraint. If a reserved range sits between returned allocations, it is -reported so callers can inspect the occupied map. +Trailing reserved ranges are omitted from the returned allocation list, +but a reserved range that sits between real allocations is reported so +callers can see the full occupied map. 
-## Where To Update This Page +## When to update this page -Update this page when changing any of these behaviors: +Update this page when any of these change: -- placement phase order in `vm_topology::layout` -- `reserve`, `fixed`, `ram`, or `request` semantics -- worker RAM alignment policy -- VTL2 `MemoryLayout` placement -- host physical-address validation policy +- the allocator's phase order or any phase's placement direction +- the semantics of `reserve`, `fixed`, `ram`, or `request` +- the architectural reserved zones or their per-architecture addresses +- the worker's RAM alignment policy +- PCIe ECAM sizing or per-BAR alignment policy +- VTL2 chipset MMIO or VTL2 private-memory placement +- the host physical-address validation step - `MemoryLayout::end_of_layout()` or `MemoryLayout::vtl2_range()` semantics From af7d1a9f0f114a32023059b7b3f19fd55c6c30ce Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 17:25:44 +0000 Subject: [PATCH 15/36] improve --- Cargo.lock | 2 - openvmm/openvmm_core/src/worker/dispatch.rs | 13 +- .../openvmm_core/src/worker/memory_layout.rs | 129 +++++++----------- openvmm/openvmm_defs/Cargo.toml | 2 +- openvmm/openvmm_defs/src/config.rs | 34 +++-- openvmm/openvmm_entry/src/lib.rs | 7 +- petri/src/vm/openvmm/modify.rs | 7 +- vmm_core/vm_manifest_builder/Cargo.toml | 1 - vmm_core/vm_manifest_builder/src/lib.rs | 32 +++-- vmm_core/vmm_core_defs/Cargo.toml | 1 - vmm_core/vmm_core_defs/src/lib.rs | 15 -- 11 files changed, 104 insertions(+), 139 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c11ea08871..4d7817b6ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9513,7 +9513,6 @@ dependencies = [ "serial_pl011_resources", "thiserror 2.0.16", "vm_resource", - "vmm_core_defs", "vmotherboard", ] @@ -10074,7 +10073,6 @@ name = "vmm_core_defs" version = "0.0.0" dependencies = [ "inspect", - "memory_range", "mesh", "virt", ] diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs 
index 9eff7f1011..cd7f31e5df 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -57,7 +57,6 @@ use openvmm_defs::config::GicConfig; use openvmm_defs::config::HypervisorConfig; use openvmm_defs::config::LoadMode; use openvmm_defs::config::MemoryConfig; -use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieDeviceConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieSwitchConfig; @@ -253,9 +252,9 @@ pub struct Manifest { chipset_devices: Vec, pci_chipset_devices: Vec, chipset_capabilities: VmChipsetCapabilities, - chipset_low_mmio: Option, - chipset_high_mmio: Option, - vtl2_chipset_mmio: Option, + chipset_low_mmio: Option, + chipset_high_mmio: Option, + vtl2_chipset_mmio: Option, generation_id_recv: Option>, rtc_delta_milliseconds: i64, automatic_guest_reset: bool, @@ -927,9 +926,9 @@ impl InitializedVm { let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), - chipset_low_mmio: cfg.chipset_low_mmio.as_ref(), - chipset_high_mmio: cfg.chipset_high_mmio.as_ref(), - vtl2_chipset_mmio: cfg.vtl2_chipset_mmio.as_ref(), + chipset_low_mmio: cfg.chipset_low_mmio, + chipset_high_mmio: cfg.chipset_high_mmio, + vtl2_chipset_mmio: cfg.vtl2_chipset_mmio, pcie_root_complexes: &cfg.pcie_root_complexes, virtio_mmio_count, vtl2_layout, diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 5258aeacfd..8a17c5a3a5 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -11,7 +11,7 @@ //! restore. Keep layout policy changes deliberate and covered by tests. //! //! The resolver owns all layout consumers: architectural reserved zones (LAPIC, -//! IOAPIC, GIC, etc.), chipset MMIO (VMBus relay, PIIX4 PCI BARs), PCIe +//! 
IOAPIC, GIC, etc.), chipset MMIO (VMBus, PIIX4 PCI BARs), PCIe //! ECAM/BAR pools, virtio-mmio slots, ordinary RAM, VTL2 private memory, and //! VTL2 chipset MMIO. Callers express sizing intent; the resolver places //! everything and derives the effective MMIO gaps for [`MemoryLayout`]. @@ -20,7 +20,7 @@ use super::vm_loaders::igvm::Vtl2MemoryLayoutRequest; use anyhow::Context; use anyhow::bail; use memory_range::MemoryRange; -use openvmm_defs::config::MmioRangeConfig; +use openvmm_defs::config::PcieMmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; use std::sync::Arc; use vm_topology::layout::LayoutBuilder; @@ -43,7 +43,7 @@ pub(super) struct ResolvedMemoryLayout { /// 4 KiB, indexed from the start of the region. `None` when no /// virtio-mmio devices are configured. pub virtio_mmio_region: Option, - /// Chipset low MMIO range (below 4 GiB) for VMOD/PCI0 _CRS. `None` when + /// Chipset low MMIO range (below 4 GB) for VMOD/PCI0 _CRS. `None` when /// no VMBus / chipset MMIO is configured. pub chipset_low_mmio: Option, /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. `None` when @@ -67,15 +67,18 @@ pub(super) struct MemoryLayoutInput<'a> { /// Optional per-vNUMA RAM budgets. When present, these must sum to /// `mem_size`, and request order is the vnode assignment order. pub numa_mem_sizes: Option<&'a [u64]>, - /// Chipset low MMIO range (below 4 GiB). This is the VMOD/PCI0 _CRS range - /// for VMBus relay devices and PIIX4 PCI BARs. - pub chipset_low_mmio: Option<&'a MmioRangeConfig>, - /// Chipset high MMIO range (above RAM). This is the VMOD/PCI0 _CRS high - /// range for VMBus relay devices. - pub chipset_high_mmio: Option<&'a MmioRangeConfig>, - /// VTL2-private chipset MMIO range. Placed after all VTL0-visible layout - /// so enabling VTL2 does not move VTL0 addresses. - pub vtl2_chipset_mmio: Option<&'a MmioRangeConfig>, + /// Chipset low MMIO size (below 4 GB). 
This is the VMOD/PCI0 _CRS range + /// for VMBus devices and PIIX4 PCI BARs. The address is always allocated + /// dynamically. `None` disables the range. + pub chipset_low_mmio: Option, + /// Chipset high MMIO size (above RAM). This is the VMOD/PCI0 _CRS high + /// range for VMBus devices. The address is always allocated dynamically. + /// `None` disables the range. + pub chipset_high_mmio: Option, + /// VTL2-private chipset MMIO size. Placed after all VTL0-visible layout + /// so enabling VTL2 does not move VTL0 addresses. The address is always + /// allocated dynamically. `None` disables the range. + pub vtl2_chipset_mmio: Option, /// PCIe root complex address-space intents. These are resolved by this /// worker step so front ends do not need to carve guest physical addresses. pub pcie_root_complexes: &'a [PcieRootComplexConfig], @@ -128,26 +131,24 @@ pub(super) fn resolve_memory_layout( }; builder.reserve("arch_reserved", arch_reserved); - // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus relay + // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus // devices and PIIX4 PCI BARs. - if let Some(config) = input.chipset_low_mmio { - add_mmio_range( - &mut builder, + if let Some(size) = input.chipset_low_mmio { + builder.request( "chipset_low_mmio", &mut chipset_low_mmio, - config, + size, TWO_MB, Placement::Mmio32, ); } // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. - if let Some(config) = input.chipset_high_mmio { - add_mmio_range( - &mut builder, + if let Some(size) = input.chipset_high_mmio { + builder.request( "chipset_high_mmio", &mut chipset_high_mmio, - config, + size, TWO_MB, Placement::Mmio64, ); @@ -159,13 +160,18 @@ pub(super) fn resolve_memory_layout( .zip(&mut pcie_root_complex_ranges) .enumerate() { - add_pcie_ecam_range( - &mut builder, + // ECAM: always dynamically allocated below 4GB (since Linux on x86_64 + // refuses to use ECAM above 4GB unless the BIOS is of a special shape). 
+ // Size is derived from the bus range. + // + // TODO: fix the Linux loader and move this above 4GB before the layout + // is stabilized. + builder.request( format!("pcie[{index}].ecam"), &mut ranges.ecam_range, - &root_complex.ecam_range, pcie_ecam_size(root_complex)?, PCIE_ECAM_BYTES_PER_BUS, + Placement::Mmio32, ); // Low MMIO: 2 MB aligned. add_mmio_range( @@ -224,12 +230,11 @@ pub(super) fn resolve_memory_layout( // VTL2 chipset MMIO is implementation-private — placed after all // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. - if let Some(config) = input.vtl2_chipset_mmio { - add_mmio_range( - &mut builder, + if let Some(size) = input.vtl2_chipset_mmio { + builder.request( "vtl2_chipset_mmio", &mut vtl2_chipset_mmio, - config, + size, TWO_MB, Placement::PostMmio, ); @@ -357,36 +362,19 @@ fn pcie_ecam_size(root_complex: &PcieRootComplexConfig) -> anyhow::Result { Ok((u64::from(bus_count) + 1) * PCIE_ECAM_BYTES_PER_BUS) } -fn add_pcie_ecam_range<'a>( - builder: &mut LayoutBuilder<'a>, - tag: impl Into>, - target: &'a mut MemoryRange, - config: &Option, - size: u64, - alignment: u64, -) { - match config { - Some(range) => { - *target = *range; - builder.fixed(tag, *range); - } - None => builder.request(tag, target, size, alignment, Placement::Mmio32), - } -} - fn add_mmio_range<'a>( builder: &mut LayoutBuilder<'a>, tag: impl Into>, target: &'a mut MemoryRange, - config: &MmioRangeConfig, + config: &PcieMmioRangeConfig, alignment: u64, placement: Placement, ) { match config { - MmioRangeConfig::Dynamic { size } => { + PcieMmioRangeConfig::Dynamic { size } => { builder.request(tag, target, *size, alignment, placement); } - MmioRangeConfig::Fixed(range) => { + PcieMmioRangeConfig::Fixed(range) => { *target = *range; builder.fixed(tag, *range); } @@ -437,17 +425,6 @@ mod tests { const DEFAULT_CHIPSET_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; const DEFAULT_VTL2_CHIPSET_MMIO_SIZE: u64 = GB; - const DEFAULT_CHIPSET_LOW: MmioRangeConfig = 
MmioRangeConfig::Dynamic { - size: DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64, - }; - const DEFAULT_CHIPSET_HIGH: MmioRangeConfig = MmioRangeConfig::Dynamic { - size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, - }; - - const DEFAULT_VTL2_CHIPSET: MmioRangeConfig = MmioRangeConfig::Dynamic { - size: DEFAULT_VTL2_CHIPSET_MMIO_SIZE, - }; - fn input( mem_size: u64, numa_mem_sizes: Option<&[u64]>, @@ -456,8 +433,8 @@ mod tests { MemoryLayoutInput { mem_size, numa_mem_sizes, - chipset_low_mmio: Some(&DEFAULT_CHIPSET_LOW), - chipset_high_mmio: Some(&DEFAULT_CHIPSET_HIGH), + chipset_low_mmio: Some(DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64), + chipset_high_mmio: Some(DEFAULT_CHIPSET_HIGH_MMIO_SIZE), vtl2_chipset_mmio: None, pcie_root_complexes: &[], virtio_mmio_count: 0, @@ -478,9 +455,8 @@ mod tests { } fn pcie_root_complex( - ecam_range: Option, - low_mmio: MmioRangeConfig, - high_mmio: MmioRangeConfig, + low_mmio: PcieMmioRangeConfig, + high_mmio: PcieMmioRangeConfig, ) -> PcieRootComplexConfig { PcieRootComplexConfig { index: 0, @@ -488,7 +464,6 @@ mod tests { segment: 0, start_bus: 0, end_bus: 0, - ecam_range, low_mmio, high_mmio, ports: Vec::new(), @@ -506,8 +481,8 @@ mod tests { #[test] fn ram_splits_around_arch_reserved_zone() { - // 4 GiB of RAM must split around the architectural reserved zone - // and the chipset MMIO allocations below 4 GiB. + // 4 GB of RAM must split around the architectural reserved zone + // and the chipset MMIO allocations below 4 GB. let actual = resolve(input(4 * GB, None, None)); assert_eq!(actual.ram_size(), 4 * GB); @@ -532,7 +507,7 @@ mod tests { // First vnode's RAM starts at 0. assert_eq!(actual.ram()[0].vnode, 0); assert_eq!(actual.ram()[0].range.start(), 0); - // All RAM accounts for 4 GiB total. + // All RAM accounts for 4 GB total. 
assert_eq!(actual.ram_size(), 4 * GB); } @@ -548,10 +523,7 @@ mod tests { .expect("should have high chipset MMIO"); assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64); assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); - assert!( - low.end() <= 4 * GB, - "low chipset MMIO should be below 4 GiB" - ); + assert!(low.end() <= 4 * GB, "low chipset MMIO should be below 4 GB"); assert!( high.start() >= 2 * GB, "high chipset MMIO should be above RAM" @@ -561,9 +533,8 @@ mod tests { #[test] fn pcie_dynamic_intents_are_resolved() { let root_complexes = [pcie_root_complex( - None, - MmioRangeConfig::Dynamic { size: 64 * MB }, - MmioRangeConfig::Dynamic { size: GB }, + PcieMmioRangeConfig::Dynamic { size: 64 * MB }, + PcieMmioRangeConfig::Dynamic { size: GB }, )]; let mut config = input(2 * GB, None, None); config.pcie_root_complexes = &root_complexes; @@ -573,7 +544,7 @@ mod tests { assert!( ranges.ecam_range.end() <= 4 * GB, - "ECAM should be below 4 GiB" + "ECAM should be below 4 GB" ); assert_eq!(ranges.low_mmio.len(), 64 * MB); assert_eq!(ranges.high_mmio.len(), GB); @@ -669,7 +640,7 @@ mod tests { .virtio_mmio_region .expect("should have virtio-mmio region"); assert_eq!(region.len(), 3 * PAGE_SIZE); - assert!(region.end() <= 4 * GB, "virtio-mmio should be below 4 GiB"); + assert!(region.end() <= 4 * GB, "virtio-mmio should be below 4 GB"); } #[test] @@ -694,7 +665,7 @@ mod tests { #[test] fn vtl2_chipset_mmio_is_post_mmio() { let mut config = input(2 * GB, None, None); - config.vtl2_chipset_mmio = Some(&DEFAULT_VTL2_CHIPSET); + config.vtl2_chipset_mmio = Some(DEFAULT_VTL2_CHIPSET_MMIO_SIZE); let result = resolve_memory_layout(config).unwrap(); @@ -716,7 +687,7 @@ mod tests { fn vtl2_chipset_mmio_does_not_move_vtl0_layout() { let without = resolve(input(2 * GB, None, None)); let mut config = input(2 * GB, None, None); - config.vtl2_chipset_mmio = Some(&DEFAULT_VTL2_CHIPSET); + config.vtl2_chipset_mmio = Some(DEFAULT_VTL2_CHIPSET_MMIO_SIZE); let with = 
resolve_memory_layout(config).unwrap(); assert_eq!(with.memory_layout.ram(), without.ram()); diff --git a/openvmm/openvmm_defs/Cargo.toml b/openvmm/openvmm_defs/Cargo.toml index 884361ac0f..cce2dd82f2 100644 --- a/openvmm/openvmm_defs/Cargo.toml +++ b/openvmm/openvmm_defs/Cargo.toml @@ -11,7 +11,7 @@ hypervisor_resources.workspace = true openvmm_pcat_locator.workspace = true # vmcore -memory_range.workspace = true +memory_range = { workspace = true, features = ["mesh"] } vm_resource.workspace = true vmgs_resources.workspace = true diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 9dac349ac0..3b337e1fda 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -55,12 +55,15 @@ pub struct Config { pub chipset_devices: Vec, pub pci_chipset_devices: Vec, pub chipset_capabilities: VmChipsetCapabilities, - /// Chipset low MMIO range (below 4 GiB) for VMOD/PCI0 _CRS. - pub chipset_low_mmio: Option, - /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. - pub chipset_high_mmio: Option, - /// VTL2-private chipset MMIO range for VTL2 VMBus. - pub vtl2_chipset_mmio: Option, + /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `None` disables the range. + pub chipset_low_mmio: Option, + /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `None` disables the range. + pub chipset_high_mmio: Option, + /// VTL2-private chipset MMIO range size for VTL2 VMBus. + /// The address is always allocated dynamically. `None` disables the range. + pub vtl2_chipset_mmio: Option, pub generation_id_recv: Option>, // This is used for testing. TODO: resourcify, and also store this in VMGS. 
pub rtc_delta_milliseconds: i64, @@ -192,7 +195,19 @@ pub enum Vtl2BaseAddressType { Vtl2Allocate { size: Option }, } -pub use vmm_core_defs::MmioRangeConfig; +/// Specifies a PCIe MMIO BAR window, either by size (the resolver allocates) or +/// by a fixed location. Fixed locations exist for assigned-device, IOMMU, and +/// physical-topology compatibility. +#[derive(Debug, MeshPayload)] +pub enum PcieMmioRangeConfig { + /// Dynamically allocate a range of the given size. + Dynamic { + /// Size of the range in bytes. + size: u64, + }, + /// Use the specified fixed memory range. + Fixed(MemoryRange), +} #[derive(Debug, MeshPayload)] pub struct PcieRootComplexConfig { @@ -201,9 +216,8 @@ pub struct PcieRootComplexConfig { pub segment: u16, pub start_bus: u8, pub end_bus: u8, - pub ecam_range: Option, - pub low_mmio: MmioRangeConfig, - pub high_mmio: MmioRangeConfig, + pub low_mmio: PcieMmioRangeConfig, + pub high_mmio: PcieMmioRangeConfig, pub ports: Vec, } diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 2f51954392..73e551ce72 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -72,8 +72,8 @@ use openvmm_defs::config::HypervisorConfig; use openvmm_defs::config::LateMapVtl0MemoryPolicy; use openvmm_defs::config::LoadMode; use openvmm_defs::config::MemoryConfig; -use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieDeviceConfig; +use openvmm_defs::config::PcieMmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; use openvmm_defs::config::PcieSwitchConfig; @@ -744,11 +744,10 @@ async fn vm_config_from_command_line( segment: rc_cli.segment, start_bus: rc_cli.start_bus, end_bus: rc_cli.end_bus, - ecam_range: None, - low_mmio: MmioRangeConfig::Dynamic { + low_mmio: PcieMmioRangeConfig::Dynamic { size: low_mmio_size, }, - high_mmio: MmioRangeConfig::Dynamic { + high_mmio: PcieMmioRangeConfig::Dynamic { size: high_mmio_size, 
}, ports, diff --git a/petri/src/vm/openvmm/modify.rs b/petri/src/vm/openvmm/modify.rs index 8794ed4b9f..dcaadb322c 100644 --- a/petri/src/vm/openvmm/modify.rs +++ b/petri/src/vm/openvmm/modify.rs @@ -24,8 +24,8 @@ use nvme_resources::NvmeControllerHandle; use openvmm_defs::config::Config; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::LoadMode; -use openvmm_defs::config::MmioRangeConfig; use openvmm_defs::config::PcieDeviceConfig; +use openvmm_defs::config::PcieMmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; use openvmm_defs::config::PcieSwitchConfig; @@ -269,11 +269,10 @@ impl PetriVmConfigOpenVmm { segment: segment.try_into().unwrap(), start_bus: start_bus.try_into().unwrap(), end_bus: end_bus.try_into().unwrap(), - ecam_range: None, - low_mmio: MmioRangeConfig::Dynamic { + low_mmio: PcieMmioRangeConfig::Dynamic { size: LOW_MMIO_SIZE, }, - high_mmio: MmioRangeConfig::Dynamic { + high_mmio: PcieMmioRangeConfig::Dynamic { size: HIGH_MMIO_SIZE, }, ports, diff --git a/vmm_core/vm_manifest_builder/Cargo.toml b/vmm_core/vm_manifest_builder/Cargo.toml index a040d3a9b0..b89fa5d683 100644 --- a/vmm_core/vm_manifest_builder/Cargo.toml +++ b/vmm_core/vm_manifest_builder/Cargo.toml @@ -15,7 +15,6 @@ serial_core.workspace = true serial_debugcon_resources.workspace = true serial_pl011_resources.workspace = true vm_resource.workspace = true -vmm_core_defs.workspace = true vmotherboard.workspace = true mesh.workspace = true diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index 0d69c5093d..45db9769b7 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -43,7 +43,6 @@ use vm_resource::PlatformResource; use vm_resource::Resource; use vm_resource::ResourceId; use vm_resource::kind::SerialBackendHandle; -use vmm_core_defs::MmioRangeConfig; use vmotherboard::ChipsetDeviceHandle; use 
vmotherboard::LegacyPciChipsetDeviceHandle; use vmotherboard::options::BaseChipsetManifest; @@ -103,15 +102,18 @@ pub struct VmChipsetResult { pub pci_chipset_devices: Vec, /// Derived chipset capabilities needed by firmware and table generation. pub capabilities: VmChipsetCapabilities, - /// Default chipset low MMIO range (below 4 GiB) for VMOD/PCI0 _CRS. - /// `None` when the VM type has no VMBus or PCI bus. - pub chipset_low_mmio: Option, - /// Default chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. - /// `None` when the VM type has no VMBus or PCI bus. - pub chipset_high_mmio: Option, - /// Default VTL2-private chipset MMIO range for VTL2 VMBus. - /// `None` when the VM type does not include VTL2. - pub vtl2_chipset_mmio: Option, + /// Default chipset low MMIO size (below 4 GiB) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `None` when the VM type + /// has no VMBus or PCI bus. + pub chipset_low_mmio: Option, + /// Default chipset high MMIO size (above RAM) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `None` when the VM type + /// has no VMBus or PCI bus. + pub chipset_high_mmio: Option, + /// Default VTL2-private chipset MMIO size for VTL2 VMBus. + /// The address is always allocated dynamically. `None` when the VM type + /// does not include VTL2. + pub vtl2_chipset_mmio: Option, } /// Error type for building a VM manifest. 
@@ -419,13 +421,13 @@ impl VmManifestBuilder { | BaseChipsetType::HypervGen2Uefi | BaseChipsetType::HyperVGen2LinuxDirect | BaseChipsetType::UnenlightenedLinuxDirect => { - result.chipset_low_mmio = Some(MmioRangeConfig::Dynamic { size: default_low }); - result.chipset_high_mmio = Some(MmioRangeConfig::Dynamic { size: default_high }); + result.chipset_low_mmio = Some(default_low); + result.chipset_high_mmio = Some(default_high); } BaseChipsetType::HclHost => { - result.chipset_low_mmio = Some(MmioRangeConfig::Dynamic { size: default_low }); - result.chipset_high_mmio = Some(MmioRangeConfig::Dynamic { size: default_high }); - result.vtl2_chipset_mmio = Some(MmioRangeConfig::Dynamic { size: default_vtl2 }); + result.chipset_low_mmio = Some(default_low); + result.chipset_high_mmio = Some(default_high); + result.vtl2_chipset_mmio = Some(default_vtl2); } } diff --git a/vmm_core/vmm_core_defs/Cargo.toml b/vmm_core/vmm_core_defs/Cargo.toml index 36d22e00a5..c2c93559d8 100644 --- a/vmm_core/vmm_core_defs/Cargo.toml +++ b/vmm_core/vmm_core_defs/Cargo.toml @@ -7,7 +7,6 @@ edition.workspace = true rust-version.workspace = true [dependencies] -memory_range = { workspace = true, features = ["mesh"] } virt.workspace = true inspect.workspace = true diff --git a/vmm_core/vmm_core_defs/src/lib.rs b/vmm_core/vmm_core_defs/src/lib.rs index 35fcdd2f3f..221cbe554c 100644 --- a/vmm_core/vmm_core_defs/src/lib.rs +++ b/vmm_core/vmm_core_defs/src/lib.rs @@ -9,24 +9,9 @@ pub mod debug_rpc; use inspect::Inspect; -use memory_range::MemoryRange; -use mesh::MeshPayload; use mesh::payload::Protobuf; use std::sync::Arc; -/// Specifies an MMIO range, either by size (the resolver allocates) or by -/// fixed location. -#[derive(Debug, MeshPayload)] -pub enum MmioRangeConfig { - /// Dynamically allocate a range of the given size. - Dynamic { - /// Size of the range in bytes. - size: u64, - }, - /// Use the specified fixed memory range. 
- Fixed(MemoryRange), -} - /// HaltReason sent by devices and vp_set to the vmm. #[derive(Debug, Clone, Eq, PartialEq, Protobuf, Inspect)] #[inspect(tag = "halt_reason")] From bb56b68ef6fb7e7b94030d4f6cea3b1ce103085c Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 18:01:19 +0000 Subject: [PATCH 16/36] clarity --- vm/vmcore/vm_topology/src/layout.rs | 239 +++++++++++----------------- 1 file changed, 97 insertions(+), 142 deletions(-) diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index 23fcd30b0b..f71c891341 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -129,6 +129,15 @@ struct DynamicRequest<'a> { alignment: u64, } +impl DynamicRequest<'_> { + /// Sort key for the dynamic placement phases: larger alignment first, then + /// larger size first. Wrapping with `Reverse` makes the descending order + /// self-evident at the call site. + fn placement_sort_key(&self) -> std::cmp::Reverse<(u64, u64)> { + std::cmp::Reverse((self.alignment, self.size)) + } +} + struct RamRequest<'a> { tag: Arc, target: &'a mut Vec, @@ -136,45 +145,14 @@ struct RamRequest<'a> { alignment: u64, } -trait RequestDetails { - fn tag(&self) -> &Arc; - fn size(&self) -> u64; - fn alignment(&self) -> u64; -} - -impl RequestDetails for DynamicRequest<'_> { - fn tag(&self) -> &Arc { - &self.tag - } - - fn size(&self) -> u64 { - self.size - } - - fn alignment(&self) -> u64 { - self.alignment - } -} - -impl RequestDetails for RamRequest<'_> { - fn tag(&self) -> &Arc { - &self.tag - } - - fn size(&self) -> u64 { - self.size - } - - fn alignment(&self) -> u64 { - self.alignment - } -} - struct AllocationState { - // Sorted, non-overlapping ranges not yet consumed by any request. Keeping - // free space as the primary state lets each phase update the map - // incrementally instead of repeatedly subtracting all allocations from the - // whole address space. 
+ // Sorted, non-overlapping, non-empty ranges not yet consumed by any + // request. Keeping free space as the primary state lets each phase update + // the map incrementally instead of repeatedly subtracting all allocations + // from the whole address space. + // + // The non-empty invariant lets `remove_free_range` locate the containing + // free range with a single `partition_point` lookup. free: Vec, allocations: Vec, // Highest end address of ordinary RAM. High MMIO starts here so the layout @@ -210,21 +188,18 @@ impl AllocationState { // Pack 32-bit MMIO from the top of the 4 GiB window downward so RAM can // start at GPA 0 and grow upward through the lowest remaining space. // Alignment/size ordering keeps large, constrained windows from being - // fragmented by small devices. `sort_by` is stable, so otherwise equal - // requests keep caller order. - requests.sort_by(|request, other_request| { - other_request - .alignment - .cmp(&request.alignment) - .then(other_request.size.cmp(&request.size)) - }); + // fragmented by small devices. `sort_by_key` is stable, so otherwise + // equal requests keep caller order. + requests.sort_by_key(|r| r.placement_sort_key()); for request in requests { let Some(start) = find_highest_fit(&self.free, request.size, request.alignment, 0, FOUR_GIB) else { return Err(exhausted_error( - request, + &request.tag, + request.size, + request.alignment, AllocationPhase::Mmio32, &self.free, 0, @@ -253,7 +228,15 @@ impl AllocationState { ADDRESS_LIMIT, ) .ok_or_else(|| { - exhausted_error(request, AllocationPhase::Ram, &self.free, 0, ADDRESS_LIMIT) + exhausted_error( + &request.tag, + request.size, + request.alignment, + AllocationPhase::Ram, + &self.free, + 0, + ADDRESS_LIMIT, + ) })?; request.target.clear(); @@ -270,12 +253,7 @@ impl AllocationState { // High MMIO is allocated bottom up from the end of RAM. 
The allocator // intentionally does not take host physical-address width as an input; // callers validate the resulting top against host capabilities later. - requests.sort_by(|request, other_request| { - other_request - .alignment - .cmp(&request.alignment) - .then(other_request.size.cmp(&request.size)) - }); + requests.sort_by_key(|r| r.placement_sort_key()); for request in requests { let Some(start) = find_lowest_fit( @@ -286,7 +264,9 @@ impl AllocationState { ADDRESS_LIMIT, ) else { return Err(exhausted_error( - request, + &request.tag, + request.size, + request.alignment, AllocationPhase::Mmio64, &self.free, self.ram_end, @@ -319,7 +299,9 @@ impl AllocationState { ADDRESS_LIMIT, ) else { return Err(exhausted_error( - request, + &request.tag, + request.size, + request.alignment, AllocationPhase::PostMmio, &self.free, layout_top, @@ -344,23 +326,18 @@ impl AllocationState { .unwrap_or(0) } - fn record(&mut self, tag: &Arc, kind: PlacedRangeKind, range: MemoryRange) { + fn allocate_range(&mut self, tag: &Arc, kind: PlacedRangeKind, range: MemoryRange) { + self.remove_free_range(range); self.allocations.push(PlacedRange { tag: tag.clone(), kind, range, }); - if kind == PlacedRangeKind::Ram { self.ram_end = self.ram_end.max(range.end()); } } - fn allocate_range(&mut self, tag: &Arc, kind: PlacedRangeKind, range: MemoryRange) { - self.remove_free_range(range); - self.record(tag, kind, range); - } - fn remove_free_range(&mut self, allocated: MemoryRange) { let free_index = self .free @@ -481,25 +458,16 @@ impl<'a> LayoutBuilder<'a> { alignment: u64, placement: Placement, ) { + let request = DynamicRequest { + tag: tag.into(), + target, + size, + alignment, + }; match placement { - Placement::Mmio32 => self.mmio32.push(DynamicRequest { - tag: tag.into(), - target, - size, - alignment, - }), - Placement::Mmio64 => self.mmio64.push(DynamicRequest { - tag: tag.into(), - target, - size, - alignment, - }), - Placement::PostMmio => self.post_mmio.push(DynamicRequest { - 
tag: tag.into(), - target, - size, - alignment, - }), + Placement::Mmio32 => self.mmio32.push(request), + Placement::Mmio64 => self.mmio64.push(request), + Placement::PostMmio => self.post_mmio.push(request), } } @@ -529,13 +497,13 @@ impl<'a> LayoutBuilder<'a> { /// Allocates all requests, fills in each target, and returns every placed /// range sorted by address. pub fn allocate(mut self) -> Result, AllocateError> { - validate_reserved_requests(&self.reserved)?; - validate_fixed_requests(&self.fixed)?; + validate_requests(&self.reserved, |r| (&r.tag, r.range.len(), PAGE_SIZE))?; + validate_requests(&self.fixed, |r| (&r.tag, r.range.len(), PAGE_SIZE))?; validate_pinned_ranges(&self.reserved, &self.fixed)?; - validate_dynamic_requests(&self.mmio32)?; - validate_ram_requests(&self.ram)?; - validate_dynamic_requests(&self.mmio64)?; - validate_dynamic_requests(&self.post_mmio)?; + validate_requests(&self.mmio32, |r| (&r.tag, r.size, r.alignment))?; + validate_requests(&self.ram, |r| (&r.tag, r.size, r.alignment))?; + validate_requests(&self.mmio64, |r| (&r.tag, r.size, r.alignment))?; + validate_requests(&self.post_mmio, |r| (&r.tag, r.size, r.alignment))?; let mut state = AllocationState::new(); state.place_reserved(&self.reserved); @@ -546,6 +514,11 @@ impl<'a> LayoutBuilder<'a> { state.place_post_mmio(&mut self.post_mmio)?; state.allocations.sort_by_key(|allocation| allocation.range); + // Trailing reserved ranges sit above every guest-visible allocation and + // exist only to keep that space out of the free list during placement. + // Returning them would bloat the layout without informing any + // consumer, so drop them. Reserved ranges interleaved with real + // allocations are still reported. 
while state .allocations .last() @@ -581,17 +554,13 @@ fn validate_size_alignment(tag: &Arc, size: u64, alignment: u64) -> Result< Ok(()) } -fn validate_reserved_requests(requests: &[ReservedRequest]) -> Result<(), AllocateError> { - for request in requests { - validate_size_alignment(&request.tag, request.range.len(), PAGE_SIZE)?; - } - - Ok(()) -} - -fn validate_fixed_requests(requests: &[FixedRequest]) -> Result<(), AllocateError> { +fn validate_requests( + requests: &[T], + get: impl Fn(&T) -> (&Arc, u64, u64), +) -> Result<(), AllocateError> { for request in requests { - validate_size_alignment(&request.tag, request.range.len(), PAGE_SIZE)?; + let (tag, size, alignment) = get(request); + validate_size_alignment(tag, size, alignment)?; } Ok(()) @@ -627,33 +596,19 @@ fn validate_pinned_ranges( Ok(()) } -fn validate_dynamic_requests(requests: &[DynamicRequest<'_>]) -> Result<(), AllocateError> { - for request in requests { - validate_size_alignment(&request.tag, request.size, request.alignment)?; - } - - Ok(()) -} - -fn validate_ram_requests(requests: &[RamRequest<'_>]) -> Result<(), AllocateError> { - for request in requests { - validate_size_alignment(&request.tag, request.size, request.alignment)?; - } - - Ok(()) -} - fn exhausted_error( - request: &impl RequestDetails, + tag: &Arc, + size: u64, + alignment: u64, phase: AllocationPhase, free_ranges: &[MemoryRange], region_start: u64, region_end: u64, ) -> AllocateError { AllocateError::Exhausted { - tag: request.tag().clone(), - size: request.size(), - alignment: request.alignment(), + tag: tag.clone(), + size, + alignment, phase, free_space: free_space_in_region(free_ranges, region_start, region_end), } @@ -662,14 +617,19 @@ fn exhausted_error( fn free_space_in_region(free_ranges: &[MemoryRange], region_start: u64, region_end: u64) -> u64 { free_ranges .iter() - .map(|range| { - let effective_start = range.start().max(region_start); - let effective_end = range.end().min(region_end); - 
effective_end.saturating_sub(effective_start) - }) + .filter_map(|range| clamp_to_region(*range, region_start, region_end)) + .map(|(start, end)| end - start) .sum() } +/// Clamps a free range to the requested placement region. Returns `None` when +/// the intersection is empty. +fn clamp_to_region(range: MemoryRange, region_start: u64, region_end: u64) -> Option<(u64, u64)> { + let start = range.start().max(region_start); + let end = range.end().min(region_end); + (start < end).then_some((start, end)) +} + fn find_highest_fit( free_ranges: &[MemoryRange], size: u64, @@ -678,15 +638,15 @@ fn find_highest_fit( region_end: u64, ) -> Option { for range in free_ranges.iter().rev() { - let effective_start = range.start().max(region_start); - let effective_end = range.end().min(region_end); - - if effective_start >= effective_end || effective_end - effective_start < size { + let Some((effective_start, effective_end)) = + clamp_to_region(*range, region_start, region_end) + else { + continue; + }; + if effective_end - effective_start < size { continue; } - - let latest_start = effective_end - size; - let aligned_start = align_down(latest_start, alignment); + let aligned_start = align_down(effective_end - size, alignment); if aligned_start >= effective_start { return Some(aligned_start); } @@ -703,20 +663,17 @@ fn find_lowest_fit( region_end: u64, ) -> Option { for range in free_ranges { - let effective_start = range.start().max(region_start); - let effective_end = range.end().min(region_end); - - if effective_start >= effective_end { + let Some((effective_start, effective_end)) = + clamp_to_region(*range, region_start, region_end) + else { continue; - } - + }; let Some(aligned_start) = align_up(effective_start, alignment) else { continue; }; let Some(end) = aligned_start.checked_add(size) else { continue; }; - if end <= effective_end { return Some(aligned_start); } @@ -736,13 +693,11 @@ fn find_lowest_splittable_fit( let mut ranges = Vec::new(); for range in free_ranges { 
- let effective_start = range.start().max(region_start); - let effective_end = range.end().min(region_end); - - if effective_start >= effective_end { + let Some((effective_start, effective_end)) = + clamp_to_region(*range, region_start, region_end) + else { continue; - } - + }; let Some(aligned_start) = align_up(effective_start, alignment) else { continue; }; From e6669167e2c2238b054d981dcb98a277382cd680 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 18:16:59 +0000 Subject: [PATCH 17/36] while we're here --- .../src/worker/vm_loaders/igvm.rs | 60 +++++++++++++------ 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index 01abe0c70b..3049b32517 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -30,7 +30,6 @@ use openvmm_defs::config::SerialInformation; use openvmm_defs::config::Vtl2BaseAddressType; use range_map_vec::RangeMap; use std::collections::HashMap; -use std::ffi::CString; use std::io::Read; use std::io::Seek; use thiserror::Error; @@ -46,8 +45,8 @@ use zerocopy::IntoBytes; #[derive(Debug, Error)] pub enum Error { - #[error("command line is not a valid C string")] - InvalidCommandLine(#[source] std::ffi::NulError), + #[error("command line contains an embedded NUL byte")] + CommandLineContainsNul, #[error("failed to read igvm file")] Igvm(#[source] std::io::Error), #[error("invalid igvm file")] @@ -256,20 +255,37 @@ pub fn vtl2_memory_layout_request( Ok(Vtl2MemoryLayoutRequest { size, alignment }) } -/// Build a device tree representing the whole guest partition. -fn build_device_tree( - processor_topology: &ProcessorTopology, - all_ram: &[MemoryRangeWithNode], - vtl2_protectable_ram: &[MemoryRange], +/// Parameters for [`build_device_tree`]. 
+struct BuildDeviceTreeParams<'a> { + processor_topology: &'a ProcessorTopology, + all_ram: &'a [MemoryRangeWithNode], + vtl2_protectable_ram: &'a [MemoryRange], vtl2_base_address: Vtl2BaseAddressType, - command_line: &str, + command_line: &'a str, with_vmbus_redirect: bool, com_serial: Option, - entropy: Option<&[u8]>, + entropy: Option<&'a [u8]>, chipset_low_mmio: Option, chipset_high_mmio: Option, vtl2_chipset_mmio: Option, -) -> Result, fdt::builder::Error> { +} + +/// Build a device tree representing the whole guest partition. +fn build_device_tree(params: BuildDeviceTreeParams<'_>) -> Result, fdt::builder::Error> { + let BuildDeviceTreeParams { + processor_topology, + all_ram, + vtl2_protectable_ram, + vtl2_base_address, + command_line, + with_vmbus_redirect, + com_serial, + entropy, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, + } = params; + let mut buf = vec![0; HV_PAGE_SIZE as usize * 256]; let mut builder = fdt::builder::Builder::new(fdt::builder::BuilderConfig { @@ -575,7 +591,12 @@ fn load_igvm_x86( cmdline.to_string() }; - let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; + // The command line is exposed to the guest as a NUL-terminated byte + // sequence (via the IGVM CommandLine parameter), so reject any embedded NUL + // bytes up front. + if cmdline.as_bytes().contains(&0) { + return Err(Error::CommandLineContainsNul); + } let (mask, max_vtl) = match vbs_platform_header(igvm_file)? 
{ IgvmPlatformHeader::SupportedPlatform(info) => { @@ -948,22 +969,25 @@ fn load_igvm_x86( import_parameter(&mut parameter_areas, info, memory_map.as_bytes())?; } IgvmDirectiveHeader::CommandLine(ref info) => { - import_parameter(&mut parameter_areas, info, command_line.as_bytes_with_nul())?; + let mut bytes = Vec::with_capacity(cmdline.len() + 1); + bytes.extend_from_slice(cmdline.as_bytes()); + bytes.push(0); + import_parameter(&mut parameter_areas, info, &bytes)?; } IgvmDirectiveHeader::DeviceTree(ref info) => { - let dt = build_device_tree( + let dt = build_device_tree(BuildDeviceTreeParams { processor_topology, - &all_ram, - &vtl2_protectable_ram, + all_ram: &all_ram, + vtl2_protectable_ram: &vtl2_protectable_ram, vtl2_base_address, - &String::from_utf8_lossy(command_line.as_bytes()), + command_line: &cmdline, with_vmbus_redirect, com_serial, entropy, chipset_low_mmio, chipset_high_mmio, vtl2_chipset_mmio, - ) + }) .map_err(Error::DeviceTree)?; import_parameter(&mut parameter_areas, info, &dt)?; } From f2fa992cf9942c33b514363ef128dfe393069221 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 18:34:20 +0000 Subject: [PATCH 18/36] fix --- .../openvmm_core/src/worker/memory_layout.rs | 47 +++++++++++++++++++ vmm_core/src/acpi_builder.rs | 10 +++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 8a17c5a3a5..2d7be92c26 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -35,6 +35,15 @@ const GB: u64 = 1024 * 1024 * 1024; /// PCIe ECAM: 32 devices * 8 functions * 4 KiB config space = 1 MB per bus. const PCIE_ECAM_BYTES_PER_BUS: u64 = 32 * 8 * 4096; +/// Minimum guest physical address at which an ECAM range may be placed. +/// +/// The ACPI MCFG table reports the bus-0 base as +/// `ecam_range.start() - start_bus * 1 MiB`. 
`start_bus` is a `u8`, so up to +/// 255 MiB of headroom may be required. Rounding up to a flat 256 MiB gives a +/// single easy-to-remember invariant that works for every legal `start_bus` +/// value, independent of any individual root complex's configuration. +const PCIE_ECAM_MIN_ADDRESS: u64 = 256 * 1024 * 1024; + #[derive(Debug)] pub(super) struct ResolvedMemoryLayout { pub memory_layout: MemoryLayout, @@ -263,6 +272,25 @@ pub(super) fn resolve_memory_layout( .allocate() .context("allocating memory layout ranges")?; + // Enforce the MCFG bus-0 base invariant: every ECAM range must sit at + // `PCIE_ECAM_MIN_ADDRESS` or above. Fail fast at VM construction with a + // clear error rather than letting an unrepresentable MCFG entry surface + // later as a panic (debug) or silent wraparound (release). + for (root_complex, ranges) in input + .pcie_root_complexes + .iter() + .zip(&pcie_root_complex_ranges) + { + if ranges.ecam_range.start() < PCIE_ECAM_MIN_ADDRESS { + bail!( + "PCIe root complex {:?}: ECAM at {:#x} is below the {:#x} minimum", + root_complex.name, + ranges.ecam_range.start(), + PCIE_ECAM_MIN_ADDRESS, + ); + } + } + let ram = ram_ranges_by_node .into_iter() .enumerate() @@ -704,4 +732,23 @@ mod tests { assert!(result.chipset_low_mmio.is_none()); assert!(result.chipset_high_mmio.is_none()); } + + #[test] + fn ecam_below_256mb_is_rejected() { + // Force ECAM placement below 256 MiB by reserving most of the 32-bit + // MMIO window for low_mmio. The Mmio32 zone is ~4064 MiB on x86_64, + // so a 3840 MiB low_mmio request plus the default 96 MiB chipset_low + // pushes ECAM down to ~127 MiB. The resolver must bail because MCFG + // cannot represent a bus-0 base below the ECAM start. 
+ let root_complexes = [pcie_root_complex( + PcieMmioRangeConfig::Dynamic { size: 3840 * MB }, + PcieMmioRangeConfig::Dynamic { size: GB }, + )]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let err = resolve_memory_layout(config).unwrap_err(); + + assert!(err.to_string().contains("ECAM"), "unexpected error: {err}"); + } } diff --git a/vmm_core/src/acpi_builder.rs b/vmm_core/src/acpi_builder.rs index 791edcb857..49503f2f4e 100644 --- a/vmm_core/src/acpi_builder.rs +++ b/vmm_core/src/acpi_builder.rs @@ -329,10 +329,18 @@ impl AcpiTablesBuilder<'_, T> { // address reported in the MCFG table must reflect wherever bus number // 0 would be accessible even if the host bridge has a different starting // bus number. + // + // The layout resolver guarantees `ecam_range.start() >= + // start_bus * 1 MiB` so this subtraction never underflows in + // practice. Use `wrapping_sub` anyway so that, if a future code + // path ever bypasses that check, behavior matches what a C MCFG + // builder would do: the guest sees a wrapped base address and is + // most likely to still compute the right per-bus ECAM addresses + // for the buses it actually accesses. 
let ecam_region_offset = (bridge.start_bus as u64) * 256 * 4096; mcfg_extra.extend_from_slice( acpi_spec::mcfg::McfgSegmentBusRange::new( - bridge.ecam_range.start() - ecam_region_offset, + bridge.ecam_range.start().wrapping_sub(ecam_region_offset), bridge.segment, bridge.start_bus, bridge.end_bus, From 47487ab79de49393101acb9527ef8954afeff4b3 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 18:48:03 +0000 Subject: [PATCH 19/36] idiot --- .../openvmm_core/src/worker/memory_layout.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 2d7be92c26..a38b484498 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -304,23 +304,23 @@ pub(super) fn resolve_memory_layout( let vtl2_range = input.vtl2_layout.map(|_| vtl2_range); - // Derive the effective MMIO gaps from the resolved allocations. These are - // the non-RAM, non-VTL2 ranges that MemoryLayout stores as `mmio()`. We - // collect chipset MMIO, PCIe, virtio-mmio, and the architectural reserved - // zone into sorted gap vectors so existing consumers of - // `MemoryLayout::mmio()` keep working. + // `MemoryLayout::mmio()` is a legacy positional contract preserved here + // exactly as callers had it pre-allocator: `[0]` = chipset low MMIO, + // `[1]` = chipset high MMIO, and (when VTL2 is enabled) `[2]` = the + // VTL2-private chipset MMIO range. Consumers (DSDT, Linux DT, UEFI, + // PCAT) rely on this ordering. The architectural reserved zone and + // virtio-mmio region were never part of this vector and remain tracked + // separately. `MemoryLayout::mmio()` will eventually be removed. 
let mut mmio_gaps: Vec = Vec::new(); - mmio_gaps.push(arch_reserved); if input.chipset_low_mmio.is_some() { mmio_gaps.push(chipset_low_mmio); } if input.chipset_high_mmio.is_some() { mmio_gaps.push(chipset_high_mmio); } - if input.virtio_mmio_count > 0 { - mmio_gaps.push(virtio_mmio_region); + if input.vtl2_chipset_mmio.is_some() { + mmio_gaps.push(vtl2_chipset_mmio); } - mmio_gaps.sort(); let mut pci_ecam_gaps: Vec = Vec::new(); pci_ecam_gaps.extend( From d70d295583e4f8018388145ab2f847ca1a08f081 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 18:48:13 +0000 Subject: [PATCH 20/36] actually above 4GB --- vm/vmcore/vm_topology/src/layout.rs | 60 ++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index f71c891341..eec518f630 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -51,7 +51,8 @@ const ADDRESS_LIMIT: u64 = MemoryRange::MAX_ADDRESS; pub enum Placement { /// The allocation must fit below the 4 GiB boundary and is placed top down. Mmio32, - /// The allocation is placed bottom up from the end of RAM. + /// The allocation must sit above the 4 GiB boundary and is placed bottom + /// up above RAM. Mmio64, /// The allocation is placed bottom up after RAM and all MMIO allocations. /// @@ -250,17 +251,20 @@ impl AllocationState { } fn place_mmio64(&mut self, requests: &mut [DynamicRequest<'_>]) -> Result<(), AllocateError> { - // High MMIO is allocated bottom up from the end of RAM. The allocator - // intentionally does not take host physical-address width as an input; - // callers validate the resulting top against host capabilities later. + // High MMIO is allocated bottom up above RAM, but never below the + // 4 GiB boundary: it is "64-bit" MMIO and must not overlap the 32-bit + // window even when RAM is small. 
The allocator intentionally does not + // take host physical-address width as an input; callers validate the + // resulting top against host capabilities later. requests.sort_by_key(|r| r.placement_sort_key()); + let floor = self.ram_end.max(FOUR_GIB); for request in requests { let Some(start) = find_lowest_fit( &self.free, request.size, request.alignment, - self.ram_end, + floor, ADDRESS_LIMIT, ) else { return Err(exhausted_error( @@ -269,7 +273,7 @@ impl AllocationState { request.alignment, AllocationPhase::Mmio64, &self.free, - self.ram_end, + floor, ADDRESS_LIMIT, )); }; @@ -892,7 +896,7 @@ mod tests { } #[test] - fn mmio64_uses_bottom_up_placement_from_end_of_ram() { + fn mmio64_uses_bottom_up_placement_above_four_gib() { let mut ram = Vec::new(); let mut first = MemoryRange::EMPTY; let mut second = MemoryRange::EMPTY; @@ -903,22 +907,41 @@ mod tests { builder.allocate().unwrap(); - assert_eq!(first, MemoryRange::new(2 * GIB..2 * GIB + MIB)); - assert_eq!(second, MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB)); + // Mmio64 is floored at 4 GiB even when RAM ends below it. + assert_eq!(first, MemoryRange::new(FOUR_GIB..FOUR_GIB + MIB)); + assert_eq!(second, MemoryRange::new(FOUR_GIB + MIB..FOUR_GIB + 2 * MIB)); + } + + #[test] + fn mmio64_starts_above_ram_when_ram_exceeds_four_gib() { + let mut ram = Vec::new(); + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 6 * GIB, PAGE_SIZE); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); + + builder.allocate().unwrap(); + + // No Mmio32 or fixed ranges are requested, so nothing carves a hole out + // of RAM below 4 GiB; `ram_end` is computed from the placed extents. + // Mmio64 is placed bottom-up starting at the end of RAM, above 4 GiB.
+ let ram_end = ram.iter().map(|r| r.end()).max().unwrap(); + assert_eq!(mmio64, MemoryRange::new(ram_end..ram_end + MIB)); + assert!(mmio64.start() >= FOUR_GIB); } #[test] - fn mmio64_skips_fixed_ranges_above_ram() { + fn mmio64_skips_fixed_ranges_above_four_gib() { let mut ram = Vec::new(); let mut mmio64 = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); - builder.fixed("fixed", MemoryRange::new(2 * GIB..2 * GIB + MIB)); + builder.fixed("fixed", MemoryRange::new(FOUR_GIB..FOUR_GIB + MIB)); builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); builder.allocate().unwrap(); - assert_eq!(mmio64, MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB)); + assert_eq!(mmio64, MemoryRange::new(FOUR_GIB + MIB..FOUR_GIB + 2 * MIB)); } #[test] @@ -933,10 +956,10 @@ mod tests { builder.allocate().unwrap(); - assert_eq!(mmio64, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert_eq!(mmio64, MemoryRange::new(FOUR_GIB..FOUR_GIB + MIB)); assert_eq!( post_mmio, - MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB) + MemoryRange::new(FOUR_GIB + MIB..FOUR_GIB + 2 * MIB) ); } @@ -1076,12 +1099,13 @@ mod tests { let sorted = builder.allocate().unwrap(); + // mmio32 sits just below 4 GiB; mmio64 sits at 4 GiB or above. 
assert_eq!(&*sorted[0].tag, "ram"); assert_eq!(sorted[0].kind, PlacedRangeKind::Ram); - assert_eq!(&*sorted[1].tag, "mmio64"); - assert_eq!(sorted[1].kind, PlacedRangeKind::Mmio64); - assert_eq!(&*sorted[2].tag, "mmio32"); - assert_eq!(sorted[2].kind, PlacedRangeKind::Mmio32); + assert_eq!(&*sorted[1].tag, "mmio32"); + assert_eq!(sorted[1].kind, PlacedRangeKind::Mmio32); + assert_eq!(&*sorted[2].tag, "mmio64"); + assert_eq!(sorted[2].kind, PlacedRangeKind::Mmio64); } #[test] From cff608507c6c50108d59724a8f7b64705ec27914 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 18:54:23 +0000 Subject: [PATCH 21/36] improvements --- .../architecture/openvmm/memory-layout.md | 18 +++++++++------- .../openvmm_core/src/worker/memory_layout.rs | 7 ++++--- .../src/worker/vm_loaders/igvm.rs | 21 ++++++++----------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index 2668ca3b6e..1ba8ca1f40 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -107,14 +107,16 @@ issues requests in this order: 3. **Chipset high MMIO** (`Mmio64`) — the corresponding high range. 2 MB alignment. 4. **PCIe root complex ranges**, one per root complex: - - **ECAM** (`Mmio32`). If the config specifies a fixed ECAM range, the - worker uses it as `fixed`. Otherwise the size is derived from the - bus window as `(end_bus - start_bus + 1) * 1 MB` (32 devices × 8 - functions × 4 KiB per config space). - - **Low MMIO** (`Mmio32`), 2 MB aligned. - - **High MMIO** (`Mmio64`), 1 GB aligned. Per-BAR alignment would - guarantee the entire window is usable for one large BAR, but burns - address space on hosts with tight physical-address widths. + - **ECAM** (`Mmio32`). 
The size is derived from the bus window as + `(end_bus - start_bus + 1) * 1 MB` (32 devices × 8 functions × + 4 KiB per config space). + - **Low MMIO** (`Mmio32`), 2 MB aligned. A caller can pin this to a + fixed range instead of supplying a size, for assigned-device, IOMMU, + and physical-topology passthrough. + - **High MMIO** (`Mmio64`), 1 GB aligned. A caller can pin this to a + fixed range as well. Per-BAR alignment would guarantee the entire + window is usable for one large BAR, but burns address space on + hosts with tight physical-address widths. 5. **Virtio-mmio slots** (`Mmio32`) — one contiguous region sized `slot_count * 4 KiB`, when any slots are configured. 6. **RAM**, in vnode order. The first request becomes vnode 0, the second diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index a38b484498..78c2d07005 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -268,7 +268,7 @@ pub(super) fn resolve_memory_layout( ); } - builder + let placed_ranges = builder .allocate() .context("allocating memory layout ranges")?; @@ -352,10 +352,11 @@ pub(super) fn resolve_memory_layout( // function of VM configuration and avoids host differences changing guest // physical addresses. 
let address_space_limit = 1u64 << input.physical_address_size; - if memory_layout.end_of_layout() > address_space_limit { + let layout_top = placed_ranges.last().map(|r| r.range.end()).unwrap_or(0); + if layout_top > address_space_limit { bail!( "memory layout ends at {:#x}, which exceeds the address width of {} bits", - memory_layout.end_of_layout(), + layout_top, input.physical_address_size ); } diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index 3049b32517..d4a166fee2 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -359,18 +359,15 @@ fn build_device_tree(params: BuildDeviceTreeParams<'_>) -> Result, fdt:: // Build DT ranges for VMBus devices. VTL0 gets the chipset low/high MMIO // ranges; VTL2 gets its own private chipset MMIO range. - let mut ranges_vtl0: Vec = Vec::new(); - for range in [chipset_low_mmio, chipset_high_mmio].into_iter().flatten() { - ranges_vtl0.push(range.start()); - ranges_vtl0.push(range.start()); - ranges_vtl0.push(range.len()); - } - let mut ranges_vtl2: Vec = Vec::new(); - if let Some(range) = vtl2_chipset_mmio { - ranges_vtl2.push(range.start()); - ranges_vtl2.push(range.start()); - ranges_vtl2.push(range.len()); - } + let ranges_vtl0: Vec = [chipset_low_mmio, chipset_high_mmio] + .into_iter() + .flatten() + .flat_map(|range| [range.start(), range.start(), range.len()]) + .collect(); + let ranges_vtl2: Vec = vtl2_chipset_mmio + .into_iter() + .flat_map(|range| [range.start(), range.start(), range.len()]) + .collect(); // VTL0 vmbus root device let vmbus_vtl0_name = if ranges_vtl0.is_empty() { From 25261cd5b6f4a374dd075349083052abb6227788 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 19:07:53 +0000 Subject: [PATCH 22/36] feedback --- .../architecture/openvmm/memory-layout.md | 14 +- openvmm/openvmm_core/src/worker/dispatch.rs | 24 ++-- 
.../openvmm_core/src/worker/memory_layout.rs | 50 ++++---- openvmm/openvmm_defs/src/config.rs | 12 +- openvmm/openvmm_entry/src/lib.rs | 12 +- openvmm/openvmm_entry/src/ttrpc/mod.rs | 6 +- petri/src/vm/openvmm/construct.rs | 12 +- vm/vmcore/vm_topology/src/layout.rs | 121 ++++++++++++++++-- vmm_core/vm_manifest_builder/src/lib.rs | 28 ++-- 9 files changed, 196 insertions(+), 83 deletions(-) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index 1ba8ca1f40..666361bd85 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -63,8 +63,11 @@ phase pulls from whatever address space the earlier phases left free: 2. **Fixed ranges** are removed from the free space. 3. **`Placement::Mmio32`** requests are packed *top down* below 4 GiB, so RAM can start at GPA 0 and grow upward through the lowest free space. -4. **RAM** requests are placed *bottom up* from GPA 0, splitting around any - holes left by the earlier phases. RAM is the only splittable kind. +4. **RAM** requests are placed *bottom up*, in caller order, splitting + around any holes left by the earlier phases. The first request starts at + GPA 0; each subsequent request starts at or above the highest address + used by previous RAM requests, so later requests never backfill + fragments earlier ones skipped. RAM is the only splittable kind. 5. **`Placement::Mmio64`** requests are packed *bottom up* starting at the end of RAM. This makes the layout top a function of requested topology rather than a precomputed high MMIO bucket size. @@ -120,7 +123,12 @@ issues requests in this order: 5. **Virtio-mmio slots** (`Mmio32`) — one contiguous region sized `slot_count * 4 KiB`, when any slots are configured. 6. **RAM**, in vnode order. The first request becomes vnode 0, the second - vnode 1, and so on. Alignment depends on request size: + vnode 1, and so on. 
Each vnode starts at or above the highest address + used by prior vnodes; vnode N+1 never backfills a fragment that vnode + N skipped. This keeps vnode ordering equal to address ordering and + turns vnode layout into a clean compatibility surface — adding a new + fixed or reserved range below RAM end can only shift the first vnode + whose own span actually covers it. Alignment depends on request size: | RAM request size | Alignment | |---|---| diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index cd7f31e5df..fda3b35b30 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -200,9 +200,9 @@ impl Manifest { chipset_devices: config.chipset_devices, pci_chipset_devices: config.pci_chipset_devices, chipset_capabilities: config.chipset_capabilities, - chipset_low_mmio: config.chipset_low_mmio, - chipset_high_mmio: config.chipset_high_mmio, - vtl2_chipset_mmio: config.vtl2_chipset_mmio, + chipset_low_mmio_size: config.chipset_low_mmio_size, + chipset_high_mmio_size: config.chipset_high_mmio_size, + vtl2_chipset_mmio_size: config.vtl2_chipset_mmio_size, generation_id_recv: config.generation_id_recv, rtc_delta_milliseconds: config.rtc_delta_milliseconds, automatic_guest_reset: config.automatic_guest_reset, @@ -252,9 +252,9 @@ pub struct Manifest { chipset_devices: Vec, pci_chipset_devices: Vec, chipset_capabilities: VmChipsetCapabilities, - chipset_low_mmio: Option, - chipset_high_mmio: Option, - vtl2_chipset_mmio: Option, + chipset_low_mmio_size: u64, + chipset_high_mmio_size: u64, + vtl2_chipset_mmio_size: u64, generation_id_recv: Option>, rtc_delta_milliseconds: i64, automatic_guest_reset: bool, @@ -926,9 +926,9 @@ impl InitializedVm { let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), - chipset_low_mmio: cfg.chipset_low_mmio, - chipset_high_mmio: 
cfg.chipset_high_mmio, - vtl2_chipset_mmio: cfg.vtl2_chipset_mmio, + chipset_low_mmio_size: cfg.chipset_low_mmio_size, + chipset_high_mmio_size: cfg.chipset_high_mmio_size, + vtl2_chipset_mmio_size: cfg.vtl2_chipset_mmio_size, pcie_root_complexes: &cfg.pcie_root_complexes, virtio_mmio_count, vtl2_layout, @@ -3376,9 +3376,9 @@ impl LoadedVm { chipset_devices: vec![], // TODO pci_chipset_devices: vec![], // TODO chipset_capabilities: self.inner.chipset_capabilities, - chipset_low_mmio: None, // TODO - chipset_high_mmio: None, // TODO - vtl2_chipset_mmio: None, // TODO + chipset_low_mmio_size: 0, // TODO + chipset_high_mmio_size: 0, // TODO + vtl2_chipset_mmio_size: 0, // TODO generation_id_recv: None, // TODO rtc_delta_milliseconds: 0, // TODO automatic_guest_reset: self.inner.automatic_guest_reset, diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 78c2d07005..0b55172fcc 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -78,16 +78,16 @@ pub(super) struct MemoryLayoutInput<'a> { pub numa_mem_sizes: Option<&'a [u64]>, /// Chipset low MMIO size (below 4 GB). This is the VMOD/PCI0 _CRS range /// for VMBus devices and PIIX4 PCI BARs. The address is always allocated - /// dynamically. `None` disables the range. - pub chipset_low_mmio: Option, + /// dynamically. `0` disables the range. + pub chipset_low_mmio_size: u64, /// Chipset high MMIO size (above RAM). This is the VMOD/PCI0 _CRS high /// range for VMBus devices. The address is always allocated dynamically. - /// `None` disables the range. - pub chipset_high_mmio: Option, + /// `0` disables the range. + pub chipset_high_mmio_size: u64, /// VTL2-private chipset MMIO size. Placed after all VTL0-visible layout /// so enabling VTL2 does not move VTL0 addresses. The address is always - /// allocated dynamically. `None` disables the range. 
- pub vtl2_chipset_mmio: Option, + /// allocated dynamically. `0` disables the range. + pub vtl2_chipset_mmio_size: u64, /// PCIe root complex address-space intents. These are resolved by this /// worker step so front ends do not need to carve guest physical addresses. pub pcie_root_complexes: &'a [PcieRootComplexConfig], @@ -142,22 +142,22 @@ pub(super) fn resolve_memory_layout( // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus // devices and PIIX4 PCI BARs. - if let Some(size) = input.chipset_low_mmio { + if input.chipset_low_mmio_size != 0 { builder.request( "chipset_low_mmio", &mut chipset_low_mmio, - size, + input.chipset_low_mmio_size, TWO_MB, Placement::Mmio32, ); } // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. - if let Some(size) = input.chipset_high_mmio { + if input.chipset_high_mmio_size != 0 { builder.request( "chipset_high_mmio", &mut chipset_high_mmio, - size, + input.chipset_high_mmio_size, TWO_MB, Placement::Mmio64, ); @@ -239,11 +239,11 @@ pub(super) fn resolve_memory_layout( // VTL2 chipset MMIO is implementation-private — placed after all // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. - if let Some(size) = input.vtl2_chipset_mmio { + if input.vtl2_chipset_mmio_size != 0 { builder.request( "vtl2_chipset_mmio", &mut vtl2_chipset_mmio, - size, + input.vtl2_chipset_mmio_size, TWO_MB, Placement::PostMmio, ); @@ -312,13 +312,13 @@ pub(super) fn resolve_memory_layout( // virtio-mmio region were never part of this vector and remain tracked // separately. `MemoryLayout::mmio()` will eventually be removed. 
let mut mmio_gaps: Vec = Vec::new(); - if input.chipset_low_mmio.is_some() { + if input.chipset_low_mmio_size != 0 { mmio_gaps.push(chipset_low_mmio); } - if input.chipset_high_mmio.is_some() { + if input.chipset_high_mmio_size != 0 { mmio_gaps.push(chipset_high_mmio); } - if input.vtl2_chipset_mmio.is_some() { + if input.vtl2_chipset_mmio_size != 0 { mmio_gaps.push(vtl2_chipset_mmio); } @@ -371,9 +371,9 @@ pub(super) fn resolve_memory_layout( memory_layout, pcie_root_complex_ranges, virtio_mmio_region, - chipset_low_mmio: input.chipset_low_mmio.map(|_| chipset_low_mmio), - chipset_high_mmio: input.chipset_high_mmio.map(|_| chipset_high_mmio), - vtl2_chipset_mmio: input.vtl2_chipset_mmio.map(|_| vtl2_chipset_mmio), + chipset_low_mmio: (input.chipset_low_mmio_size != 0).then_some(chipset_low_mmio), + chipset_high_mmio: (input.chipset_high_mmio_size != 0).then_some(chipset_high_mmio), + vtl2_chipset_mmio: (input.vtl2_chipset_mmio_size != 0).then_some(vtl2_chipset_mmio), }) } @@ -462,9 +462,9 @@ mod tests { MemoryLayoutInput { mem_size, numa_mem_sizes, - chipset_low_mmio: Some(DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64), - chipset_high_mmio: Some(DEFAULT_CHIPSET_HIGH_MMIO_SIZE), - vtl2_chipset_mmio: None, + chipset_low_mmio_size: DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64, + chipset_high_mmio_size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, + vtl2_chipset_mmio_size: 0, pcie_root_complexes: &[], virtio_mmio_count: 0, vtl2_layout, @@ -694,7 +694,7 @@ mod tests { #[test] fn vtl2_chipset_mmio_is_post_mmio() { let mut config = input(2 * GB, None, None); - config.vtl2_chipset_mmio = Some(DEFAULT_VTL2_CHIPSET_MMIO_SIZE); + config.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; let result = resolve_memory_layout(config).unwrap(); @@ -716,7 +716,7 @@ mod tests { fn vtl2_chipset_mmio_does_not_move_vtl0_layout() { let without = resolve(input(2 * GB, None, None)); let mut config = input(2 * GB, None, None); - config.vtl2_chipset_mmio = Some(DEFAULT_VTL2_CHIPSET_MMIO_SIZE); + 
config.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; let with = resolve_memory_layout(config).unwrap(); assert_eq!(with.memory_layout.ram(), without.ram()); @@ -725,8 +725,8 @@ mod tests { #[test] fn no_chipset_mmio_when_none() { let mut config = input(2 * GB, None, None); - config.chipset_low_mmio = None; - config.chipset_high_mmio = None; + config.chipset_low_mmio_size = 0; + config.chipset_high_mmio_size = 0; let result = resolve_memory_layout(config).unwrap(); diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 3b337e1fda..93c1d44636 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -56,14 +56,14 @@ pub struct Config { pub pci_chipset_devices: Vec, pub chipset_capabilities: VmChipsetCapabilities, /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `None` disables the range. - pub chipset_low_mmio: Option, + /// The address is always allocated dynamically. `0` disables the range. + pub chipset_low_mmio_size: u64, /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `None` disables the range. - pub chipset_high_mmio: Option, + /// The address is always allocated dynamically. `0` disables the range. + pub chipset_high_mmio_size: u64, /// VTL2-private chipset MMIO range size for VTL2 VMBus. - /// The address is always allocated dynamically. `None` disables the range. - pub vtl2_chipset_mmio: Option, + /// The address is always allocated dynamically. `0` disables the range. + pub vtl2_chipset_mmio_size: u64, pub generation_id_recv: Option>, // This is used for testing. TODO: resourcify, and also store this in VMGS. 
pub rtc_delta_milliseconds: i64, diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 73e551ce72..eaa7e24ddf 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -877,9 +877,9 @@ async fn vm_config_from_command_line( mut chipset_devices, pci_chipset_devices, capabilities, - chipset_low_mmio, - chipset_high_mmio, - vtl2_chipset_mmio, + chipset_low_mmio_size, + chipset_high_mmio_size, + vtl2_chipset_mmio_size, } = chipset .build() .context("failed to build chipset configuration")?; @@ -1634,9 +1634,9 @@ async fn vm_config_from_command_line( chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, - chipset_low_mmio, - chipset_high_mmio, - vtl2_chipset_mmio, + chipset_low_mmio_size, + chipset_high_mmio_size, + vtl2_chipset_mmio_size, #[cfg(windows)] vpci_resources, vmgs, diff --git a/openvmm/openvmm_entry/src/ttrpc/mod.rs b/openvmm/openvmm_entry/src/ttrpc/mod.rs index 22bde5b29e..1b097b900b 100644 --- a/openvmm/openvmm_entry/src/ttrpc/mod.rs +++ b/openvmm/openvmm_entry/src/ttrpc/mod.rs @@ -610,9 +610,9 @@ impl VmService { chipset_devices: chipset.chipset_devices, pci_chipset_devices: chipset.pci_chipset_devices, chipset_capabilities: chipset.capabilities, - chipset_low_mmio: chipset.chipset_low_mmio, - chipset_high_mmio: chipset.chipset_high_mmio, - vtl2_chipset_mmio: chipset.vtl2_chipset_mmio, + chipset_low_mmio_size: chipset.chipset_low_mmio_size, + chipset_high_mmio_size: chipset.chipset_high_mmio_size, + vtl2_chipset_mmio_size: chipset.vtl2_chipset_mmio_size, generation_id_recv: None, rtc_delta_milliseconds: 0, automatic_guest_reset: true, diff --git a/petri/src/vm/openvmm/construct.rs b/petri/src/vm/openvmm/construct.rs index b1b9b67b3c..5b1a23c4a4 100644 --- a/petri/src/vm/openvmm/construct.rs +++ b/petri/src/vm/openvmm/construct.rs @@ -459,9 +459,9 @@ impl PetriVmConfigOpenVmm { mut chipset_devices, pci_chipset_devices, capabilities, - chipset_low_mmio, - chipset_high_mmio, - 
vtl2_chipset_mmio, + chipset_low_mmio_size, + chipset_high_mmio_size, + vtl2_chipset_mmio_size, } = chipset; // Add the TPM @@ -499,9 +499,9 @@ impl PetriVmConfigOpenVmm { chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, - chipset_low_mmio, - chipset_high_mmio, - vtl2_chipset_mmio, + chipset_low_mmio_size, + chipset_high_mmio_size, + vtl2_chipset_mmio_size, // Basic virtualization device support hypervisor: HypervisorConfig { diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs index eec518f630..31d5d8f3de 100644 --- a/vm/vmcore/vm_topology/src/layout.rs +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -220,12 +220,21 @@ impl AllocationState { // Ordinary RAM is the only splittable request type in this API. It is // placed after low MMIO so the resulting RAM extents describe the // actual guest-visible memory map, including holes below 4 GiB. + // + // Requests are placed in caller order, and each request starts at or + // above the highest address used by previous RAM requests. A later + // RAM request never backfills a fragment that an earlier one skipped: + // this keeps the flattened RAM list sorted by address (matching the + // invariant `MemoryLayout` validates) and turns vnode order into a + // clean compatibility surface, since adding new fixed or reserved + // ranges only shifts vnodes whose own span actually covers them. for request in requests { + let floor = self.ram_end; let ranges = find_lowest_splittable_fit( &self.free, request.size, request.alignment, - 0, + floor, ADDRESS_LIMIT, ) .ok_or_else(|| { @@ -235,7 +244,7 @@ impl AllocationState { request.alignment, AllocationPhase::Ram, &self.free, - 0, + floor, ADDRESS_LIMIT, ) })?; @@ -477,12 +486,16 @@ impl<'a> LayoutBuilder<'a> { /// Adds an ordinary RAM request to the builder. /// - /// RAM is placed bottom up from GPA 0 and may split around fixed and MMIO32 - /// ranges. 
Each extent starts at `alignment`, and split extents that do not - /// satisfy the rest of the request are rounded down to `alignment` so large - /// aligned requests are not fragmented into smaller chunks. The target - /// vector is replaced with the placed RAM extents when [`Self::allocate`] - /// succeeds. + /// RAM requests are placed in caller order. The first request is placed + /// bottom up from GPA 0; each subsequent request starts at or above the + /// highest address used by previous RAM requests, so later requests never + /// backfill fragments skipped by earlier ones. A single request may still + /// split around fixed and Mmio32 ranges encountered inside its own span; + /// each extent starts at `alignment`, and split extents that do not + /// satisfy the rest of the request are rounded down to `alignment` so + /// large aligned requests are not fragmented into smaller chunks. The + /// target vector is replaced with the placed RAM extents when + /// [`Self::allocate`] succeeds. pub fn ram( &mut self, tag: impl Into>, @@ -895,6 +908,98 @@ mod tests { ); } + #[test] + fn ram_requests_are_placed_in_order() { + // Two RAM requests must not interleave: the second request starts at + // or above the maximum end address of the first, so the flattened + // RAM list is always sorted by address. + let mut first = Vec::new(); + let mut second = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.ram("first", &mut first, 2 * GIB, PAGE_SIZE); + builder.ram("second", &mut second, GIB, PAGE_SIZE); + + builder.allocate().unwrap(); + + assert_eq!(first, [MemoryRange::new(0..2 * GIB)]); + assert_eq!(second, [MemoryRange::new(2 * GIB..3 * GIB)]); + } + + #[test] + fn ram_request_does_not_backfill_earlier_fragments() { + // A small fixed range below the first RAM request's end leaves an + // unaligned fragment that the first request skips. 
An earlier + // best-fit policy would have allowed a smaller-aligned later RAM + // request to backfill that fragment, producing an out-of-order RAM + // list. In-order placement floors each request at the previous + // request's end, so the fragment stays unallocated and vnode order + // matches address order. + let mut first = Vec::new(); + let mut second = Vec::new(); + let mut builder = LayoutBuilder::new(); + // Carve a tiny hole inside what the first request would otherwise + // round down to a GiB-aligned chunk. + builder.fixed("hole", MemoryRange::new(GIB + MIB..GIB + 2 * MIB)); + builder.ram("first", &mut first, 2 * GIB, GIB); + builder.ram("second", &mut second, 256 * MIB, PAGE_SIZE); + builder.allocate().unwrap(); + + // First request lands at [0, 1 GiB) and [2 GiB, 3 GiB); the fragment + // at [1 GiB + 2 MiB, 2 GiB) is left free. + assert_eq!( + first, + [MemoryRange::new(0..GIB), MemoryRange::new(2 * GIB..3 * GIB)] + ); + // The 256 MiB second request would fit at 1 GiB + 2 MiB if backfill + // were allowed; instead it must come after the first request's max + // end (3 GiB). + assert_eq!(second.len(), 1); + assert!( + second[0].start() >= first.iter().map(|r| r.end()).max().unwrap(), + "second RAM request backfilled below first request's end: {second:?}" + ); + assert_eq!(second, [MemoryRange::new(3 * GIB..3 * GIB + 256 * MIB)]); + } + + #[test] + fn ram_in_order_keeps_flattened_list_sorted_with_mmio32() { + // Reproduces the scenario that would have produced an unsorted RAM + // list under best-fit: a fixed Mmio32-style range low in memory plus + // a small second vnode that could otherwise be placed before the + // first vnode's tail. + let mut first = Vec::new(); + let mut second = Vec::new(); + let mut builder = LayoutBuilder::new(); + // A 1 MiB fixed range (e.g. a PCIe BAR) just above 1 GiB. 
+ builder.fixed("pcie_bar", MemoryRange::new(0x4010_0000..0x4020_0000)); + builder.ram("first", &mut first, 2 * GIB, PAGE_SIZE); + builder.ram("second", &mut second, 512 * MIB, PAGE_SIZE); + + builder.allocate().unwrap(); + + let first_end = first.iter().map(|r| r.end()).max().unwrap(); + assert!( + second.iter().all(|r| r.start() >= first_end), + "second vnode placed below first vnode's end: first={first:?} second={second:?}" + ); + + let mut all: Vec<_> = first.iter().chain(second.iter()).copied().collect(); + let sorted = { + let mut s = all.clone(); + s.sort_by_key(|r| r.start()); + s + }; + assert_eq!(all, sorted, "flattened RAM list must be sorted"); + // Sanity: no overlaps either. + all.sort_by_key(|r| r.start()); + for pair in all.windows(2) { + assert!( + pair[0].end() <= pair[1].start(), + "overlapping RAM ranges: {pair:?}" + ); + } + } + #[test] fn mmio64_uses_bottom_up_placement_above_four_gib() { let mut ram = Vec::new(); diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index 45db9769b7..eb6ee4d642 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -103,17 +103,17 @@ pub struct VmChipsetResult { /// Derived chipset capabilities needed by firmware and table generation. pub capabilities: VmChipsetCapabilities, /// Default chipset low MMIO size (below 4 GiB) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `None` when the VM type + /// The address is always allocated dynamically. `0` when the VM type /// has no VMBus or PCI bus. - pub chipset_low_mmio: Option, + pub chipset_low_mmio_size: u64, /// Default chipset high MMIO size (above RAM) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `None` when the VM type + /// The address is always allocated dynamically. `0` when the VM type /// has no VMBus or PCI bus. 
- pub chipset_high_mmio: Option, + pub chipset_high_mmio_size: u64, /// Default VTL2-private chipset MMIO size for VTL2 VMBus. - /// The address is always allocated dynamically. `None` when the VM type + /// The address is always allocated dynamically. `0` when the VM type /// does not include VTL2. - pub vtl2_chipset_mmio: Option, + pub vtl2_chipset_mmio_size: u64, } /// Error type for building a VM manifest. @@ -248,9 +248,9 @@ impl VmManifestBuilder { with_psp: false, with_guest_watchdog: false, }, - chipset_low_mmio: None, - chipset_high_mmio: None, - vtl2_chipset_mmio: None, + chipset_low_mmio_size: 0, + chipset_high_mmio_size: 0, + vtl2_chipset_mmio_size: 0, }; if let Some((backend, port)) = self.debugcon { @@ -421,13 +421,13 @@ impl VmManifestBuilder { | BaseChipsetType::HypervGen2Uefi | BaseChipsetType::HyperVGen2LinuxDirect | BaseChipsetType::UnenlightenedLinuxDirect => { - result.chipset_low_mmio = Some(default_low); - result.chipset_high_mmio = Some(default_high); + result.chipset_low_mmio_size = default_low; + result.chipset_high_mmio_size = default_high; } BaseChipsetType::HclHost => { - result.chipset_low_mmio = Some(default_low); - result.chipset_high_mmio = Some(default_high); - result.vtl2_chipset_mmio = Some(default_vtl2); + result.chipset_low_mmio_size = default_low; + result.chipset_high_mmio_size = default_high; + result.vtl2_chipset_mmio_size = default_vtl2; } } From df44b46bd70982aa254da878b65a442f3718d4af Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 19:19:31 +0000 Subject: [PATCH 23/36] fix test --- .../tests/x86_64/openhcl_linux_direct.rs | 115 +++++++++++++++--- 1 file changed, 98 insertions(+), 17 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs index 7a17c69534..87608da103 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs +++ 
b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs @@ -363,18 +363,67 @@ async fn parse_openhcl_memory_node( Ok(MemoryRange::new(range_start..range_end)) } -/// Test VTL2 memory allocation mode, and validate that VTL0 saw the correct -/// amount of mmio, when the host provides a VTL2 mmio range. +/// Enumerate the VTL0 chipset MMIO ranges reported by the bootloader in the +/// `openhcl` device tree node, sorted by start address. /// -/// TODO: onboard Hyper-V support in petri for custom mmio config once Hyper-V -/// supports this. +/// The `openhcl/memory@*` nodes are a mix of VTL0/VTL2 RAM and VTL0/VTL2 MMIO. +/// This helper lists the directory, filters to entries whose +/// `openhcl,memory-type` is `VTL0_MMIO` (5), and delegates to +/// `parse_openhcl_memory_node` for the range read. +async fn enumerate_openhcl_vtl0_mmio_ranges( + agent: &PipetteClient, +) -> Result<Vec<MemoryRange>, anyhow::Error> { + let sh = agent.unix_shell(); + let listing = cmd!(sh, "ls /sys/firmware/devicetree/base/openhcl/") + .read() + .await?; + let mut ranges = Vec::new(); + for name in listing.lines() { + let Some(start_hex) = name.strip_prefix("memory@") else { + continue; + }; + let start = u64::from_str_radix(start_hex, 16) + .map_err(|e| anyhow::anyhow!("failed to parse {name}: {e}"))?; + // Read the type first so we can skip non-VTL0_MMIO entries (RAM, + // VTL2_MMIO) without tripping the assertion in + // `parse_openhcl_memory_node`. + let memory_type: u32 = read_sysfs_dt::( + agent, + format!("openhcl/{name}/openhcl,memory-type").as_str(), + ) + .await? + .into(); + const VTL0_MMIO: u32 = 5; + if memory_type != VTL0_MMIO { + continue; + } + ranges.push(parse_openhcl_memory_node(agent, start).await?); + } + ranges.sort_by_key(|r| r.start()); + Ok(ranges) +} + +/// Test VTL2 memory allocation mode and validate that the bootloader-built +/// device tree reflects the host-provided VTL2 MMIO range (path A in +/// `openhcl_boot`'s MMIO selection).
+/// +/// Path B — where `openhcl_boot` carves VTL2 MMIO out of VTL0 because the host +/// did not provide a range — is covered by unit tests for +/// `select_vtl2_mmio_range` in `openhcl_boot::host_params::mmio`. #[openvmm_test(openhcl_linux_direct_x64)] async fn openhcl_linux_vtl2_mmio_self_allocate( config: PetriVmBuilder, ) -> Result<(), anyhow::Error> { - // The worker resolver allocates a 1 GiB VTL2 chipset MMIO region and - // reports 128 MiB for self allocation. Verify the device tree reflects - // this. + // Default chipset MMIO sizes for `HclHost` from + // `vm_manifest_builder::VmManifestBuilder::build`. Keep in sync with that + // file. + const DEFAULT_LOW_MMIO_SIZE: u64 = 96 * 1024 * 1024; + const DEFAULT_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; + const DEFAULT_VTL2_MMIO_SIZE: u64 = 1024 * 1024 * 1024; + // `mmio-size` is hardcoded in openvmm — see + // `openvmm_core::worker::vm_loaders::igvm::build_device_tree`. + const EXPECTED_MMIO_SIZE: u64 = 128 * 1024 * 1024; + let (mut vm, agent) = config .with_vtl2_base_address_type(Vtl2BaseAddressType::Vtl2Allocate { size: None }) .run() @@ -382,10 +431,7 @@ async fn openhcl_linux_vtl2_mmio_self_allocate( let vtl2_agent = vm.wait_for_vtl2_agent().await?; - // Read the bootloader provided fdt via sysfs to verify that the VTL2 and - // VTL0 mmio ranges are as expected, along with the allocated mmio size - // being 128 MB. - let memory_allocation_mode: String = + let memory_allocation_mode = read_sysfs_dt_string(&vtl2_agent, "openhcl/memory-allocation-mode").await?; assert_eq!(memory_allocation_mode, "vtl2"); @@ -393,16 +439,51 @@ async fn openhcl_linux_vtl2_mmio_self_allocate( read_sysfs_dt::(&vtl2_agent, "openhcl/mmio-size") .await? .into(); - // NOTE: This value is hardcoded in openvmm today to report this to the - // guest provided device tree.
- const EXPECTED_MMIO_SIZE: u64 = 128 * 1024 * 1024; assert_eq!(mmio_size, EXPECTED_MMIO_SIZE); - // Verify the VTL2 VMBus gets a non-empty MMIO range in the device tree. + // VTL2 VMBus sees exactly one MMIO range — the VTL2-private chipset MMIO + // — placed in PostMmio above all VTL0-visible RAM/MMIO. let vtl2_mmio = parse_vmbus_mmio(&vtl2_agent, "bus/vmbus").await?; + assert_eq!( + vtl2_mmio.len(), + 1, + "VTL2 should have exactly one MMIO range, got {:?}", + vtl2_mmio, + ); + assert_eq!(vtl2_mmio[0].len(), DEFAULT_VTL2_MMIO_SIZE); + assert!( + vtl2_mmio[0].start() >= 1 << 32, + "VTL2 MMIO should be above 4 GiB, got {:#x}", + vtl2_mmio[0].start(), + ); + + // VTL0 sees exactly two chipset MMIO ranges in the openhcl device tree: + // the low (Mmio32) range below 4 GiB and the high (Mmio64) range above + // RAM but below the VTL2 PostMmio range. + let vtl0_mmio = enumerate_openhcl_vtl0_mmio_ranges(&vtl2_agent).await?; + assert_eq!( + vtl0_mmio.len(), + 2, + "VTL0 should have exactly two chipset MMIO ranges, got {:?}", + vtl0_mmio, + ); + assert_eq!(vtl0_mmio[0].len(), DEFAULT_LOW_MMIO_SIZE); + assert!( + vtl0_mmio[0].end() <= 1 << 32, + "VTL0 low MMIO should be below 4 GiB, got {:?}", + vtl0_mmio[0], + ); + assert_eq!(vtl0_mmio[1].len(), DEFAULT_HIGH_MMIO_SIZE); + assert!( + vtl0_mmio[1].start() >= 1 << 32, + "VTL0 high MMIO should be above 4 GiB, got {:?}", + vtl0_mmio[1], + ); assert!( - !vtl2_mmio.is_empty(), - "VTL2 should have at least one MMIO range" + vtl0_mmio[1].end() <= vtl2_mmio[0].start(), + "VTL0 high MMIO {:?} should sit below the VTL2 chipset MMIO {:?}", + vtl0_mmio[1], + vtl2_mmio[0], ); agent.power_off().await?; From 0cb93759329917be91c17784cfec8b6c752eea34 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 19:47:23 +0000 Subject: [PATCH 24/36] fix test --- .../openvmm_core/src/worker/memory_layout.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git 
a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 0b55172fcc..71662a25b5 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -737,12 +737,20 @@ mod tests { #[test] fn ecam_below_256mb_is_rejected() { // Force ECAM placement below 256 MiB by reserving most of the 32-bit - // MMIO window for low_mmio. The Mmio32 zone is ~4064 MiB on x86_64, - // so a 3840 MiB low_mmio request plus the default 96 MiB chipset_low - // pushes ECAM down to ~127 MiB. The resolver must bail because MCFG - // cannot represent a bus-0 base below the ECAM start. + // MMIO window for low_mmio. The Mmio32 zone is ~4064 MiB on x86_64 + // and ~3824 MiB on aarch64 (the per-arch reserved zone differs), so + // the low_mmio request is sized per-arch to land ECAM around 127 MiB + // in both cases. The resolver must bail because MCFG cannot + // represent a bus-0 base below the ECAM start. + let low_mmio_size = if cfg!(guest_arch = "x86_64") { + 3840 * MB + } else { + 3600 * MB + }; let root_complexes = [pcie_root_complex( - PcieMmioRangeConfig::Dynamic { size: 3840 * MB }, + PcieMmioRangeConfig::Dynamic { + size: low_mmio_size, + }, PcieMmioRangeConfig::Dynamic { size: GB }, )]; let mut config = input(2 * GB, None, None); From cf8de7517ac945eaca1036e584a328cf5733152c Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 20:29:16 +0000 Subject: [PATCH 25/36] feedback --- .../openvmm_core/src/worker/memory_layout.rs | 64 ++++++++++++++++++- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 71662a25b5..e232d1e594 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -113,6 +113,19 @@ pub(super) fn resolve_memory_layout( ) -> anyhow::Result { let ram_sizes = 
validate_ram_sizes(input.mem_size, input.numa_mem_sizes)?; + // Chipset low and high MMIO must be paired: downstream consumers (UEFI, + // x64 DSDT, PCAT) index `MemoryLayout::mmio()` positionally and require + // both entries to be present. Allowing only one to be set would silently + // produce a layout where consumers either fail late or, with VTL2 + // enabled, misinterpret the VTL2 chipset MMIO range as the high gap. + if (input.chipset_low_mmio_size == 0) != (input.chipset_high_mmio_size == 0) { + bail!( + "chipset low and high MMIO must be both enabled or both disabled (low={:#x}, high={:#x})", + input.chipset_low_mmio_size, + input.chipset_high_mmio_size, + ); + } + let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; let mut pcie_root_complex_ranges = input .pcie_root_complexes @@ -190,7 +203,7 @@ pub(super) fn resolve_memory_layout( &root_complex.low_mmio, TWO_MB, Placement::Mmio32, - ); + )?; // High MMIO: 1 GB aligned. Ideally we'd align it to its actual size so // that the full amount is always usable for a single large BAR. But // that burns physical address space, which is especially limited on @@ -206,7 +219,7 @@ pub(super) fn resolve_memory_layout( &root_complex.high_mmio, GB, Placement::Mmio64, - ); + )?; } // Virtio-mmio: allocate one contiguous region for all slots. Each slot is @@ -398,16 +411,25 @@ fn add_mmio_range<'a>( config: &PcieMmioRangeConfig, alignment: u64, placement: Placement, -) { +) -> anyhow::Result<()> { + let tag = tag.into(); match config { PcieMmioRangeConfig::Dynamic { size } => { builder.request(tag, target, *size, alignment, placement); } PcieMmioRangeConfig::Fixed(range) => { + // A fixed low-MMIO range must satisfy the Mmio32 placement contract. + // Without this check, an above-4 GiB range would be accepted and + // then silently truncated to 32 bits in the ARM64 PCI device tree + // (`ranges` property uses `low_start as u32`). 
+ if placement == Placement::Mmio32 && range.end() > 4 * GB { + bail!("{tag}: fixed low MMIO range {range} must end at or below 4 GiB",); + } *target = *range; builder.fixed(tag, *range); } } + Ok(()) } fn validate_ram_sizes(mem_size: u64, numa_mem_sizes: Option<&[u64]>) -> anyhow::Result> { @@ -734,6 +756,42 @@ mod tests { assert!(result.chipset_high_mmio.is_none()); } + #[test] + fn asymmetric_chipset_mmio_is_rejected() { + let mut config = input(2 * GB, None, None); + config.chipset_high_mmio_size = 0; + let err = resolve_memory_layout(config).unwrap_err(); + assert!( + err.to_string().contains("both enabled or both disabled"), + "unexpected error: {err}" + ); + + let mut config = input(2 * GB, None, None); + config.chipset_low_mmio_size = 0; + let err = resolve_memory_layout(config).unwrap_err(); + assert!( + err.to_string().contains("both enabled or both disabled"), + "unexpected error: {err}" + ); + } + + #[test] + fn fixed_low_mmio_above_4gb_is_rejected() { + let root_complexes = [pcie_root_complex( + // A 1 GiB fixed low MMIO range placed above 4 GiB violates the + // Mmio32 placement contract. 
+ PcieMmioRangeConfig::Fixed(MemoryRange::new(5 * GB..6 * GB)), + PcieMmioRangeConfig::Dynamic { size: GB }, + )]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + let err = resolve_memory_layout(config).unwrap_err(); + assert!( + err.to_string().contains("must end at or below 4 GiB"), + "unexpected error: {err}" + ); + } + #[test] fn ecam_below_256mb_is_rejected() { // Force ECAM placement below 256 MiB by reserving most of the 32-bit From d970f35e6d40a9dd5512c083221b59c73f0a8c4b Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 16 May 2026 20:49:11 +0000 Subject: [PATCH 26/36] tweaks --- openvmm/openvmm_core/src/worker/dispatch.rs | 7 ++++--- .../openvmm_core/src/worker/memory_layout.rs | 21 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index fda3b35b30..e08d9bf3a5 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -766,9 +766,10 @@ fn convert_vtl2_config( // Use the size, but the base is the requested load // base. - LateMapVtl0AllowedRanges::Ranges(vec![MemoryRange::new( - *base..(*base + range.len()), - )]) + let allowed = + MemoryRange::try_new(*base..base.wrapping_add(range.len())) + .with_context(|| format!("invalid vtl2 absolute base {base:#x}"))?; + LateMapVtl0AllowedRanges::Ranges(vec![allowed]) } Vtl2BaseAddressType::MemoryLayout { .. 
} => { LateMapVtl0AllowedRanges::MemoryLayout diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index e232d1e594..06acb169d2 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -151,13 +151,13 @@ pub(super) fn resolve_memory_layout( } else { ARCH_RESERVED_AARCH64 }; - builder.reserve("arch_reserved", arch_reserved); + builder.reserve("arch-reserved", arch_reserved); // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus // devices and PIIX4 PCI BARs. if input.chipset_low_mmio_size != 0 { builder.request( - "chipset_low_mmio", + "chipset-low-mmio", &mut chipset_low_mmio, input.chipset_low_mmio_size, TWO_MB, @@ -168,7 +168,7 @@ pub(super) fn resolve_memory_layout( // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. if input.chipset_high_mmio_size != 0 { builder.request( - "chipset_high_mmio", + "chipset-high-mmio", &mut chipset_high_mmio, input.chipset_high_mmio_size, TWO_MB, @@ -176,11 +176,10 @@ pub(super) fn resolve_memory_layout( ); } - for (index, (root_complex, ranges)) in input + for (root_complex, ranges) in input .pcie_root_complexes .iter() .zip(&mut pcie_root_complex_ranges) - .enumerate() { // ECAM: always dynamically allocated below 4GB (since Linux on x86_64 // refuses to use ECAM above 4GB unless the BIOS is of a special shape). @@ -189,7 +188,7 @@ pub(super) fn resolve_memory_layout( // TODO: fix the Linux loader and move this above 4GB before the layout // is stabilized. builder.request( - format!("pcie[{index}].ecam"), + format!("pcie-{}-ecam", root_complex.name), &mut ranges.ecam_range, pcie_ecam_size(root_complex)?, PCIE_ECAM_BYTES_PER_BUS, @@ -198,7 +197,7 @@ pub(super) fn resolve_memory_layout( // Low MMIO: 2 MB aligned. 
add_mmio_range( &mut builder, - format!("pcie[{index}].low_mmio"), + format!("pcie-{}-low-mmio", root_complex.name), &mut ranges.low_mmio, &root_complex.low_mmio, TWO_MB, @@ -214,7 +213,7 @@ pub(super) fn resolve_memory_layout( // reliability issues for users. add_mmio_range( &mut builder, - format!("pcie[{index}].high_mmio"), + format!("pcie-{}-high-mmio", root_complex.name), &mut ranges.high_mmio, &root_complex.high_mmio, GB, @@ -227,7 +226,7 @@ pub(super) fn resolve_memory_layout( // request. if input.virtio_mmio_count > 0 { builder.request( - "virtio_mmio", + "virtio-mmio", &mut virtio_mmio_region, input.virtio_mmio_count as u64 * PAGE_SIZE, PAGE_SIZE, @@ -247,14 +246,14 @@ pub(super) fn resolve_memory_layout( .enumerate() { let ram_alignment = if ram_size < GB { TWO_MB } else { GB }; - builder.ram(format!("ram[{vnode}]"), ram_ranges, ram_size, ram_alignment); + builder.ram(format!("ram{vnode}"), ram_ranges, ram_size, ram_alignment); } // VTL2 chipset MMIO is implementation-private — placed after all // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. if input.vtl2_chipset_mmio_size != 0 { builder.request( - "vtl2_chipset_mmio", + "vtl2-chipset-mmio", &mut vtl2_chipset_mmio, input.vtl2_chipset_mmio_size, TWO_MB, From 6321e062f1629762338635bd7a572e461390c519 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 17 May 2026 16:02:01 -0700 Subject: [PATCH 27/36] windows fix --- .../openvmm_core/src/worker/memory_layout.rs | 135 +++++++++++++----- 1 file changed, 103 insertions(+), 32 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 06acb169d2..c1e02e5f0a 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -176,24 +176,51 @@ pub(super) fn resolve_memory_layout( ); } + // Group root complexes by PCI segment so that RCs sharing a segment get a + // single contiguous ECAM block. 
This ensures the MCFG bus-0 base address + // is consistent for all RCs in the same segment. + struct SegmentEcam { + segment: u16, + min_bus: u8, + max_bus: u8, + range: MemoryRange, + } + let mut segment_ecams: Vec<SegmentEcam> = Vec::new(); + for rc in input.pcie_root_complexes { + if let Some(entry) = segment_ecams.iter_mut().find(|e| e.segment == rc.segment) { + entry.min_bus = entry.min_bus.min(rc.start_bus); + entry.max_bus = entry.max_bus.max(rc.end_bus); + } else { + segment_ecams.push(SegmentEcam { + segment: rc.segment, + min_bus: rc.start_bus, + max_bus: rc.end_bus, + range: MemoryRange::EMPTY, + }); + } + } + + // ECAM: always dynamically allocated below 4GB (since Linux on x86_64 + // refuses to use ECAM above 4GB unless the BIOS is of a special shape). + // + // TODO: fix the Linux loader and move this above 4GB before the layout + // is stabilized. + for se in &mut segment_ecams { + let bus_count = u64::from(se.max_bus - se.min_bus) + 1; + builder.request( + format!("pcie-seg{}-ecam", se.segment), + &mut se.range, + bus_count * PCIE_ECAM_BYTES_PER_BUS, + PCIE_ECAM_BYTES_PER_BUS, + Placement::Mmio32, + ); + } + for (root_complex, ranges) in input .pcie_root_complexes .iter() .zip(&mut pcie_root_complex_ranges) { - // ECAM: always dynamically allocated below 4GB (since Linux on x86_64 - // refuses to use ECAM above 4GB unless the BIOS is of a special shape). - // Size is derived from the bus range. - // - // TODO: fix the Linux loader and move this above 4GB before the layout - // is stabilized. - builder.request( - format!("pcie-{}-ecam", root_complex.name), - &mut ranges.ecam_range, - pcie_ecam_size(root_complex)?, - PCIE_ECAM_BYTES_PER_BUS, - Placement::Mmio32, - ); // Low MMIO: 2 MB aligned. add_mmio_range( &mut builder, @@ -284,6 +311,23 @@ pub(super) fn resolve_memory_layout( .allocate() .context("allocating memory layout ranges")?; + // Subdivide per-segment ECAM blocks into per-RC sub-ranges.
+ for (root_complex, ranges) in input + .pcie_root_complexes + .iter() + .zip(&mut pcie_root_complex_ranges) + { + let se = segment_ecams + .iter() + .find(|e| e.segment == root_complex.segment) + .expect("segment must exist"); + let offset = u64::from(root_complex.start_bus - se.min_bus) * PCIE_ECAM_BYTES_PER_BUS; + let size = + u64::from(root_complex.end_bus - root_complex.start_bus + 1) * PCIE_ECAM_BYTES_PER_BUS; + ranges.ecam_range = + MemoryRange::new(se.range.start() + offset..se.range.start() + offset + size); + } + // Enforce the MCFG bus-0 base invariant: every ECAM range must sit at // `PCIE_ECAM_MIN_ADDRESS` or above. Fail fast at VM construction with a // clear error rather than letting an unrepresentable MCFG entry surface @@ -335,11 +379,7 @@ pub(super) fn resolve_memory_layout( } let mut pci_ecam_gaps: Vec<MemoryRange> = Vec::new(); - pci_ecam_gaps.extend( - pcie_root_complex_ranges - .iter() - .map(|ranges| ranges.ecam_range), - ); + pci_ecam_gaps.extend(segment_ecams.iter().map(|se| se.range)); pci_ecam_gaps.sort(); let mut pci_mmio_gaps: Vec = Vec::new(); @@ -389,20 +429,6 @@ pub(super) fn resolve_memory_layout( }) } -fn pcie_ecam_size(root_complex: &PcieRootComplexConfig) -> anyhow::Result<u64> { - let bus_count = root_complex - .end_bus - .checked_sub(root_complex.start_bus) - .with_context(|| { - format!( - "invalid PCIe bus range {}..{} for {}", - root_complex.start_bus, root_complex.end_bus, root_complex.name - ) - })?; - - Ok((u64::from(bus_count) + 1) * PCIE_ECAM_BYTES_PER_BUS) -} - fn add_mmio_range<'a>( builder: &mut LayoutBuilder<'a>, tag: impl Into>, @@ -614,6 +640,51 @@ mod tests { ); } + #[test] + fn shared_segment_gets_contiguous_ecam() { + // Two root complexes on the same segment with disjoint bus ranges + // must get ECAM sub-ranges within a single contiguous block, so + // that the MCFG bus-0 base address is the same for both.
+ let root_complexes = [ + PcieRootComplexConfig { + index: 0, + name: "rc0".to_string(), + segment: 0, + start_bus: 0, + end_bus: 15, + low_mmio: PcieMmioRangeConfig::Dynamic { size: 32 * MB }, + high_mmio: PcieMmioRangeConfig::Dynamic { size: GB }, + ports: Vec::new(), + }, + PcieRootComplexConfig { + index: 1, + name: "rc1".to_string(), + segment: 0, + start_bus: 16, + end_bus: 31, + low_mmio: PcieMmioRangeConfig::Dynamic { size: 32 * MB }, + high_mmio: PcieMmioRangeConfig::Dynamic { size: GB }, + ports: Vec::new(), + }, + ]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let actual = resolve_memory_layout(config).unwrap(); + let r0 = &actual.pcie_root_complex_ranges[0]; + let r1 = &actual.pcie_root_complex_ranges[1]; + + // rc0 ends exactly where rc1 starts (contiguous). + assert_eq!(r0.ecam_range.end(), r1.ecam_range.start()); + + // Both derive the same MCFG bus-0 base. + let bus0_base_r0 = r0.ecam_range.start() + - u64::from(root_complexes[0].start_bus) * PCIE_ECAM_BYTES_PER_BUS; + let bus0_base_r1 = r1.ecam_range.start() + - u64::from(root_complexes[1].start_bus) * PCIE_ECAM_BYTES_PER_BUS; + assert_eq!(bus0_base_r0, bus0_base_r1); + } + #[test] fn sub_gb_numa_nodes_use_two_mb_alignment() { let sizes = [512 * MB, 512 * MB]; From 0da32e71353e3a62d47604fdf1973a8a3ef3dca1 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 17 May 2026 18:10:00 -0700 Subject: [PATCH 28/36] fix --- .../openvmm_core/src/worker/memory_layout.rs | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index c1e02e5f0a..1fa2c7f900 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -322,8 +322,8 @@ pub(super) fn resolve_memory_layout( .find(|e| e.segment == root_complex.segment) .expect("segment must exist"); let offset = 
u64::from(root_complex.start_bus - se.min_bus) * PCIE_ECAM_BYTES_PER_BUS; - let size = - u64::from(root_complex.end_bus - root_complex.start_bus + 1) * PCIE_ECAM_BYTES_PER_BUS; + let size = (u64::from(root_complex.end_bus - root_complex.start_bus) + 1) + * PCIE_ECAM_BYTES_PER_BUS; ranges.ecam_range = MemoryRange::new(se.range.start() + offset..se.range.start() + offset + size); } @@ -685,6 +685,29 @@ mod tests { assert_eq!(bus0_base_r0, bus0_base_r1); } + #[test] + fn full_bus_range_ecam_does_not_overflow() { + // A single RC spanning buses 0..255 requires (255 - 0 + 1) = 256 + // buses. The bus count must be computed in u64, not u8, to avoid + // overflow. + let root_complexes = [PcieRootComplexConfig { + index: 0, + name: "rc0".to_string(), + segment: 0, + start_bus: 0, + end_bus: 255, + low_mmio: PcieMmioRangeConfig::Dynamic { size: 64 * MB }, + high_mmio: PcieMmioRangeConfig::Dynamic { size: GB }, + ports: Vec::new(), + }]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let actual = resolve_memory_layout(config).unwrap(); + let ranges = &actual.pcie_root_complex_ranges[0]; + assert_eq!(ranges.ecam_range.len(), 256 * PCIE_ECAM_BYTES_PER_BUS); + } + #[test] fn sub_gb_numa_nodes_use_two_mb_alignment() { let sizes = [512 * MB, 512 * MB]; From 9e243ea441f7dc7d132c65d0ee0908743a31312e Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 17 May 2026 21:14:00 -0700 Subject: [PATCH 29/36] test --- .../openvmm_core/src/worker/memory_layout.rs | 59 +++++++++++++++---- vmm_core/vm_manifest_builder/src/lib.rs | 15 +++-- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 1fa2c7f900..dabca8691f 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -146,23 +146,55 @@ pub(super) fn resolve_memory_layout( // Architectural reserved zone — pinned 
addresses that no dynamic consumer // may overlap (LAPIC, IOAPIC, GIC, PL011, battery, TPM, etc.). + // + // When chipset low MMIO is enabled, this zone lives *inside* + // `chipset_low_mmio` (pinned at the `(4 GiB - size) .. 4 GiB` tail; see + // below). The two together form the single window advertised to the + // guest via `\_SB.VMOD._CRS`, which guests use to arbitrate the + // resources of fixed-address child devices such as the TPM2 CRB at + // `0xFED4_0000`. In that case we must *not* also `reserve()` the zone + // here, since the allocator rejects overlapping fixed/reserved ranges. + // + // When chipset low MMIO is disabled there is no enclosing fixed range, + // so the reservation is needed explicitly to keep LAPIC/IOAPIC/etc. + // off-limits to dynamic placements. let arch_reserved = if cfg!(guest_arch = "x86_64") { ARCH_RESERVED_X86_64 } else { ARCH_RESERVED_AARCH64 }; - builder.reserve("arch-reserved", arch_reserved); + if input.chipset_low_mmio_size == 0 { + builder.reserve("arch-reserved", arch_reserved); + } - // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range for VMBus - // devices and PIIX4 PCI BARs. + // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range. Pinned to the + // `(4 GiB - size) .. 4 GiB` tail so it always fully contains the + // architectural reserved zone (LAPIC, IOAPIC, TPM, ...). + // + // This is part of the guest-visible contract: this range is what + // firmware advertises in `\_SB.VMOD._CRS`, and OS resource arbiters + // require fixed-address child devices (e.g. the TPM2 CRB declared as + // `Memory32Fixed` at `0xFED4_0000`) to fall inside that window. If the + // window did not cover the reserved zone, Windows `tpm.sys` would fail + // to claim the TPM with `TBS_E_INTERNAL_ERROR`. 
if input.chipset_low_mmio_size != 0 { - builder.request( - "chipset-low-mmio", - &mut chipset_low_mmio, - input.chipset_low_mmio_size, - TWO_MB, - Placement::Mmio32, - ); + let size = input.chipset_low_mmio_size; + let four_gb = 4 * GB; + if size > four_gb { + bail!("chipset low MMIO size {:#x} exceeds 4 GiB", size,); + } + if size < arch_reserved.len() { + bail!( + "chipset low MMIO size {:#x} is too small to cover the \ + architectural reserved zone {:#x}..{:#x} ({:#x} bytes)", + size, + arch_reserved.start(), + arch_reserved.end(), + arch_reserved.len(), + ); + } + chipset_low_mmio = MemoryRange::new(four_gb - size..four_gb); + builder.fixed("chipset-low-mmio", chipset_low_mmio); } // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. @@ -497,7 +529,7 @@ mod tests { use vm_topology::memory::AddressType; const MB: u64 = 1024 * 1024; - const DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64: u64 = 96 * 1024 * 1024; + const DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64: u64 = 128 * 1024 * 1024; const DEFAULT_CHIPSET_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; const DEFAULT_VTL2_CHIPSET_MMIO_SIZE: u64 = GB; @@ -599,7 +631,10 @@ mod tests { .expect("should have high chipset MMIO"); assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64); assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); - assert!(low.end() <= 4 * GB, "low chipset MMIO should be below 4 GB"); + // Chipset low MMIO is pinned to end at 4 GiB and must fully contain + // the architectural reserved zone (LAPIC, IOAPIC, TPM, ...). 
+ assert_eq!(low.end(), 4 * GB); + assert!(low.contains(&ARCH_RESERVED_X86_64)); assert!( high.start() >= 2 * GB, "high chipset MMIO should be above RAM" diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index eb6ee4d642..233b544190 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -408,11 +408,18 @@ impl VmManifestBuilder { // Chipset MMIO sizing: all VM types with VMBus or a PCI bus get low + // high chipset MMIO. HclHost additionally gets VTL2 chipset MMIO. // - // x86_64: 96 MiB low (128 MiB traditional gap minus 32 MiB reserved zone). - // aarch64: 240 MiB low (512 MiB traditional gap minus 272 MiB reserved zone). + // The low MMIO window is a single block pinned to end at 4 GiB, + // fully containing the architectural reserved zone (LAPIC, IOAPIC, + // TPM, etc.). This shape is part of the guest-visible contract: + // firmware advertises it as `\_SB.VMOD._CRS`, and OS resource + // arbiters (e.g. Windows `tpm.sys`) require fixed-address devices + // like the TPM2 CRB at `0xFED4_0000` to fall inside that window. + // + // x86_64: 128 MiB ending at 4 GiB (includes the 32 MiB arch zone). + // aarch64: 512 MiB ending at 4 GiB (includes the 272 MiB arch zone). let default_low = match self.arch { - MachineArch::X86_64 => 96 * 1024 * 1024, - MachineArch::Aarch64 => 240 * 1024 * 1024, + MachineArch::X86_64 => 128 * 1024 * 1024, + MachineArch::Aarch64 => 512 * 1024 * 1024, }; let default_high: u64 = 512 * 1024 * 1024; let default_vtl2: u64 = 1024 * 1024 * 1024; From 81bb0c6645fc4546592fe407b1b51a710246d7c8 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 17 May 2026 23:34:24 -0700 Subject: [PATCH 30/36] better? 
--- .../openvmm_core/src/worker/memory_layout.rs | 89 +++++++------------ vmm_core/vm_manifest_builder/src/lib.rs | 11 +-- 2 files changed, 33 insertions(+), 67 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index dabca8691f..f8fc328864 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -138,64 +138,29 @@ pub(super) fn resolve_memory_layout( .collect::>(); let mut vtl2_range = MemoryRange::EMPTY; let mut virtio_mmio_region = MemoryRange::EMPTY; - let mut chipset_low_mmio = MemoryRange::EMPTY; + let chipset_low_mmio; let mut chipset_high_mmio = MemoryRange::EMPTY; let mut vtl2_chipset_mmio = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); - // Architectural reserved zone — pinned addresses that no dynamic consumer - // may overlap (LAPIC, IOAPIC, GIC, PL011, battery, TPM, etc.). - // - // When chipset low MMIO is enabled, this zone lives *inside* - // `chipset_low_mmio` (pinned at the `(4 GiB - size) .. 4 GiB` tail; see - // below). The two together form the single window advertised to the - // guest via `\_SB.VMOD._CRS`, which guests use to arbitrate the - // resources of fixed-address child devices such as the TPM2 CRB at - // `0xFED4_0000`. In that case we must *not* also `reserve()` the zone - // here, since the allocator rejects overlapping fixed/reserved ranges. - // - // When chipset low MMIO is disabled there is no enclosing fixed range, - // so the reservation is needed explicitly to keep LAPIC/IOAPIC/etc. - // off-limits to dynamic placements. + // Chipset low MMIO (Mmio32): a fixed window pinned to the top of 32-bit + // address space, advertised to firmware as `\_SB.VMOD._CRS`. Always at + // least the architectural reserved zone (LAPIC, IOAPIC, TPM, ...) so + // guests can arbitrate fixed-address children like TPM2 against this + // window; the caller-requested size may extend it lower. 
let arch_reserved = if cfg!(guest_arch = "x86_64") { ARCH_RESERVED_X86_64 } else { ARCH_RESERVED_AARCH64 }; - if input.chipset_low_mmio_size == 0 { - builder.reserve("arch-reserved", arch_reserved); - } - - // Chipset low MMIO (Mmio32): VMOD/PCI0 _CRS low range. Pinned to the - // `(4 GiB - size) .. 4 GiB` tail so it always fully contains the - // architectural reserved zone (LAPIC, IOAPIC, TPM, ...). - // - // This is part of the guest-visible contract: this range is what - // firmware advertises in `\_SB.VMOD._CRS`, and OS resource arbiters - // require fixed-address child devices (e.g. the TPM2 CRB declared as - // `Memory32Fixed` at `0xFED4_0000`) to fall inside that window. If the - // window did not cover the reserved zone, Windows `tpm.sys` would fail - // to claim the TPM with `TBS_E_INTERNAL_ERROR`. - if input.chipset_low_mmio_size != 0 { - let size = input.chipset_low_mmio_size; - let four_gb = 4 * GB; - if size > four_gb { - bail!("chipset low MMIO size {:#x} exceeds 4 GiB", size,); - } - if size < arch_reserved.len() { - bail!( - "chipset low MMIO size {:#x} is too small to cover the \ - architectural reserved zone {:#x}..{:#x} ({:#x} bytes)", - size, - arch_reserved.start(), - arch_reserved.end(), - arch_reserved.len(), - ); - } - chipset_low_mmio = MemoryRange::new(four_gb - size..four_gb); - builder.fixed("chipset-low-mmio", chipset_low_mmio); + let four_gb = 4 * GB; + let low_mmio_size = input.chipset_low_mmio_size.max(arch_reserved.len()); + if low_mmio_size > four_gb { + bail!("chipset low MMIO size {low_mmio_size:#x} exceeds 4 GiB"); } + chipset_low_mmio = MemoryRange::new(four_gb - low_mmio_size..four_gb); + builder.fixed("chipset-low-mmio", chipset_low_mmio); // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. 
if input.chipset_high_mmio_size != 0 { @@ -529,7 +494,15 @@ mod tests { use vm_topology::memory::AddressType; const MB: u64 = 1024 * 1024; - const DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64: u64 = 128 * 1024 * 1024; + // Match the production defaults from `vm_manifest_builder`. + #[cfg(guest_arch = "x86_64")] + const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u64 = 128 * 1024 * 1024; + #[cfg(guest_arch = "aarch64")] + const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u64 = 512 * 1024 * 1024; + #[cfg(guest_arch = "x86_64")] + const ARCH_RESERVED: MemoryRange = ARCH_RESERVED_X86_64; + #[cfg(guest_arch = "aarch64")] + const ARCH_RESERVED: MemoryRange = ARCH_RESERVED_AARCH64; const DEFAULT_CHIPSET_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; const DEFAULT_VTL2_CHIPSET_MMIO_SIZE: u64 = GB; @@ -541,7 +514,7 @@ mod tests { MemoryLayoutInput { mem_size, numa_mem_sizes, - chipset_low_mmio_size: DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64, + chipset_low_mmio_size: DEFAULT_CHIPSET_LOW_MMIO_SIZE, chipset_high_mmio_size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, vtl2_chipset_mmio_size: 0, pcie_root_complexes: &[], @@ -595,7 +568,7 @@ mod tests { assert_eq!(actual.ram_size(), 4 * GB); // RAM must not overlap the architectural reserved zone. - let reserved = ARCH_RESERVED_X86_64; + let reserved = ARCH_RESERVED; for ram in actual.ram() { assert!( !ram.range.overlaps(&reserved), @@ -629,12 +602,12 @@ mod tests { let high = result .chipset_high_mmio .expect("should have high chipset MMIO"); - assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE_X86_64); + assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE); assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); // Chipset low MMIO is pinned to end at 4 GiB and must fully contain // the architectural reserved zone (LAPIC, IOAPIC, TPM, ...). 
assert_eq!(low.end(), 4 * GB); - assert!(low.contains(&ARCH_RESERVED_X86_64)); + assert!(low.contains(&ARCH_RESERVED)); assert!( high.start() >= 2 * GB, "high chipset MMIO should be above RAM" @@ -922,16 +895,16 @@ mod tests { #[test] fn ecam_below_256mb_is_rejected() { - // Force ECAM placement below 256 MiB by reserving most of the 32-bit - // MMIO window for low_mmio. The Mmio32 zone is ~4064 MiB on x86_64 - // and ~3824 MiB on aarch64 (the per-arch reserved zone differs), so - // the low_mmio request is sized per-arch to land ECAM around 127 MiB - // in both cases. The resolver must bail because MCFG cannot + // Force ECAM placement below 256 MiB by reserving most of the free + // Mmio32 window for low_mmio. The fixed chipset_low_mmio at the top + // of 32-bit space leaves 3968 MiB on x86_64 and 3584 MiB on aarch64 + // for dynamic Mmio32 requests; size low_mmio to push ECAM near + // 127 MiB on both. The resolver must bail because MCFG cannot // represent a bus-0 base below the ECAM start. let low_mmio_size = if cfg!(guest_arch = "x86_64") { 3840 * MB } else { - 3600 * MB + 3456 * MB }; let root_complexes = [pcie_root_complex( PcieMmioRangeConfig::Dynamic { diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index 233b544190..e12b0ba322 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -408,15 +408,8 @@ impl VmManifestBuilder { // Chipset MMIO sizing: all VM types with VMBus or a PCI bus get low + // high chipset MMIO. HclHost additionally gets VTL2 chipset MMIO. // - // The low MMIO window is a single block pinned to end at 4 GiB, - // fully containing the architectural reserved zone (LAPIC, IOAPIC, - // TPM, etc.). This shape is part of the guest-visible contract: - // firmware advertises it as `\_SB.VMOD._CRS`, and OS resource - // arbiters (e.g. Windows `tpm.sys`) require fixed-address devices - // like the TPM2 CRB at `0xFED4_0000` to fall inside that window. 
- // - // x86_64: 128 MiB ending at 4 GiB (includes the 32 MiB arch zone). - // aarch64: 512 MiB ending at 4 GiB (includes the 272 MiB arch zone). + // Low MMIO is a single window pinned to end at 4 GiB and advertised + // as `\_SB.VMOD._CRS`. The defaults match the legacy Hyper-V sizes. let default_low = match self.arch { MachineArch::X86_64 => 128 * 1024 * 1024, MachineArch::Aarch64 => 512 * 1024 * 1024, From 6dd53c8aeffd5b03e124d06f2acc1dacb4ac4cf1 Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 18 May 2026 10:48:35 -0700 Subject: [PATCH 31/36] wip --- .../openvmm_core/src/worker/memory_layout.rs | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index f8fc328864..4c778b6ca9 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -138,7 +138,6 @@ pub(super) fn resolve_memory_layout( .collect::>(); let mut vtl2_range = MemoryRange::EMPTY; let mut virtio_mmio_region = MemoryRange::EMPTY; - let chipset_low_mmio; let mut chipset_high_mmio = MemoryRange::EMPTY; let mut vtl2_chipset_mmio = MemoryRange::EMPTY; @@ -159,7 +158,7 @@ pub(super) fn resolve_memory_layout( if low_mmio_size > four_gb { bail!("chipset low MMIO size {low_mmio_size:#x} exceeds 4 GiB"); } - chipset_low_mmio = MemoryRange::new(four_gb - low_mmio_size..four_gb); + let chipset_low_mmio = MemoryRange::new(four_gb - low_mmio_size..four_gb); builder.fixed("chipset-low-mmio", chipset_low_mmio); // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. @@ -361,13 +360,14 @@ pub(super) fn resolve_memory_layout( // exactly as callers had it pre-allocator: `[0]` = chipset low MMIO, // `[1]` = chipset high MMIO, and (when VTL2 is enabled) `[2]` = the // VTL2-private chipset MMIO range. Consumers (DSDT, Linux DT, UEFI, - // PCAT) rely on this ordering. 
The architectural reserved zone and - // virtio-mmio region were never part of this vector and remain tracked - // separately. `MemoryLayout::mmio()` will eventually be removed. - let mut mmio_gaps: Vec = Vec::new(); - if input.chipset_low_mmio_size != 0 { - mmio_gaps.push(chipset_low_mmio); - } + // PCAT) rely on this ordering. The virtio-mmio region was never part of + // this vector and remains tracked separately. `MemoryLayout::mmio()` will + // eventually be removed. + // + // The chipset low MMIO range is always present (at least the + // architectural reserved zone) and is always reported. Hiding it would + // leave a real allocated hole in the layout invisible to consumers. + let mut mmio_gaps: Vec = vec![chipset_low_mmio]; if input.chipset_high_mmio_size != 0 { mmio_gaps.push(chipset_high_mmio); } @@ -420,7 +420,7 @@ pub(super) fn resolve_memory_layout( memory_layout, pcie_root_complex_ranges, virtio_mmio_region, - chipset_low_mmio: (input.chipset_low_mmio_size != 0).then_some(chipset_low_mmio), + chipset_low_mmio: Some(chipset_low_mmio), chipset_high_mmio: (input.chipset_high_mmio_size != 0).then_some(chipset_high_mmio), vtl2_chipset_mmio: (input.vtl2_chipset_mmio_size != 0).then_some(vtl2_chipset_mmio), }) @@ -846,15 +846,27 @@ mod tests { } #[test] - fn no_chipset_mmio_when_none() { + fn disabled_chipset_mmio_still_reports_arch_reserved() { + // Even when the caller does not request any chipset MMIO, the + // architectural reserved zone (LAPIC, IOAPIC, TPM, ...) is still + // carved out of RAM at the top of 4 GiB. That range must be + // reported so consumers see the same layout the allocator + // produced. 
let mut config = input(2 * GB, None, None); config.chipset_low_mmio_size = 0; config.chipset_high_mmio_size = 0; let result = resolve_memory_layout(config).unwrap(); - assert!(result.chipset_low_mmio.is_none()); + let low = result + .chipset_low_mmio + .expect("low chipset MMIO is always present (arch reserved zone)"); + assert_eq!(low.end(), 4 * GB); + assert!(low.contains(&ARCH_RESERVED)); assert!(result.chipset_high_mmio.is_none()); + // The reported range must appear in MemoryLayout::mmio() so that + // RAM is not silently placed around an invisible hole. + assert_eq!(result.memory_layout.mmio(), &[low]); } #[test] From 8dfa0c88efa5f2944fa74bae09a8e3b2318bad5e Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 18 May 2026 10:55:53 -0700 Subject: [PATCH 32/36] fixes --- openvmm/openvmm_core/src/worker/dispatch.rs | 31 ++++---- .../openvmm_core/src/worker/memory_layout.rs | 76 +++++-------------- .../src/worker/vm_loaders/igvm.rs | 36 +++++---- 3 files changed, 55 insertions(+), 88 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index e08d9bf3a5..94605a230c 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -407,10 +407,10 @@ pub(crate) struct InitializedVm { cfg: Manifest, mem_layout: MemoryLayout, resolved_pcie_root_complex_ranges: Vec, - virtio_mmio_region: Option, - chipset_low_mmio: Option, - chipset_high_mmio: Option, - vtl2_chipset_mmio: Option, + virtio_mmio_region: MemoryRange, + chipset_low_mmio: MemoryRange, + chipset_high_mmio: MemoryRange, + vtl2_chipset_mmio: MemoryRange, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, @@ -692,15 +692,15 @@ struct LoadedVmInner { chipset_cfg: BaseChipsetManifest, chipset_capabilities: VmChipsetCapabilities, #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] - virtio_mmio_region: Option, + virtio_mmio_region: MemoryRange,
#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] virtio_mmio_irq: u32, /// Chipset low MMIO range for VMOD/PCI0 _CRS. - chipset_low_mmio: Option, + chipset_low_mmio: MemoryRange, /// Chipset high MMIO range for VMOD/PCI0 _CRS. - chipset_high_mmio: Option, + chipset_high_mmio: MemoryRange, /// VTL2-private chipset MMIO range for VTL2 VMBus. - vtl2_chipset_mmio: Option, + vtl2_chipset_mmio: MemoryRange, /// ((device, function), interrupt) #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] pci_legacy_interrupts: Vec<((u8, Option), u32)>, @@ -2403,9 +2403,7 @@ impl InitializedVm { .await?; match bus { VirtioBus::Mmio => { - let region = virtio_mmio_region - .expect("virtio_mmio_region must be allocated for Mmio devices"); - let mmio_start = region.start() + virtio_mmio_index as u64 * 0x1000; + let mmio_start = virtio_mmio_region.start() + virtio_mmio_index as u64 * 0x1000; virtio_mmio_index += 1; let id = format!("{id}-{mmio_start}"); let gm = gm.clone(); @@ -2664,7 +2662,7 @@ impl LoadedVmInner { dsdt, &self.chipset_cfg, enable_serial, - self.virtio_mmio_region.as_ref(), + self.virtio_mmio_region, self.virtio_mmio_irq, &self.pci_legacy_interrupts, ) @@ -3422,7 +3420,7 @@ fn add_devices_to_dsdt_x64( dsdt: &mut dsdt::Dsdt, cfg: &BaseChipsetManifest, serial_uarts: bool, - virtio_mmio_region: Option<&MemoryRange>, + virtio_mmio_region: MemoryRange, virtio_mmio_irq: u32, pci_legacy_interrupts: &[((u8, Option), u32)], // ((device, function), interrupt) ) { @@ -3452,10 +3450,9 @@ fn add_devices_to_dsdt_x64( // Virtio-mmio devices are allocated as a contiguous region by the memory // layout resolver. Each 4 KiB slot is a separate device. 
- if let Some(region) = virtio_mmio_region { - let slot_count = region.len() / HV_PAGE_SIZE; - for i in 0..slot_count { - let slot_base = region.start() + i * HV_PAGE_SIZE; + { + for i in 0..virtio_mmio_region.page_count_4k() { + let slot_base = virtio_mmio_region.start() + i * HV_PAGE_SIZE; let mut device = dsdt::Device::new(format!("\\_SB.VI{i:02}").as_bytes()); device.add_object(&dsdt::NamedString::new(b"_HID", b"LNRO0005")); device.add_object(&dsdt::NamedInteger::new(b"_UID", i)); diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 4c778b6ca9..b90d446ee1 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -51,16 +51,16 @@ pub(super) struct ResolvedMemoryLayout { /// Contiguous MMIO region for all virtio-mmio device slots. Each slot is /// 4 KiB, indexed from the start of the region. `None` when no /// virtio-mmio devices are configured. - pub virtio_mmio_region: Option, + pub virtio_mmio_region: MemoryRange, /// Chipset low MMIO range (below 4 GB) for VMOD/PCI0 _CRS. `None` when /// no VMBus / chipset MMIO is configured. - pub chipset_low_mmio: Option, + pub chipset_low_mmio: MemoryRange, /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. `None` when /// no VMBus / chipset MMIO is configured. - pub chipset_high_mmio: Option, + pub chipset_high_mmio: MemoryRange, /// VTL2-private chipset MMIO range, reported to VTL2 VMBus via the device /// tree. `None` when VTL2 is not configured or has no chipset MMIO. 
- pub vtl2_chipset_mmio: Option, + pub vtl2_chipset_mmio: MemoryRange, } #[derive(Debug)] @@ -113,19 +113,6 @@ pub(super) fn resolve_memory_layout( ) -> anyhow::Result { let ram_sizes = validate_ram_sizes(input.mem_size, input.numa_mem_sizes)?; - // Chipset low and high MMIO must be paired: downstream consumers (UEFI, - // x64 DSDT, PCAT) index `MemoryLayout::mmio()` positionally and require - // both entries to be present. Allowing only one to be set would silently - // produce a layout where consumers either fail late or, with VTL2 - // enabled, misinterpret the VTL2 chipset MMIO range as the high gap. - if (input.chipset_low_mmio_size == 0) != (input.chipset_high_mmio_size == 0) { - bail!( - "chipset low and high MMIO must be both enabled or both disabled (low={:#x}, high={:#x})", - input.chipset_low_mmio_size, - input.chipset_high_mmio_size, - ); - } - let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; let mut pcie_root_complex_ranges = input .pcie_root_complexes @@ -136,10 +123,6 @@ pub(super) fn resolve_memory_layout( high_mmio: MemoryRange::EMPTY, }) .collect::>(); - let mut vtl2_range = MemoryRange::EMPTY; - let mut virtio_mmio_region = MemoryRange::EMPTY; - let mut chipset_high_mmio = MemoryRange::EMPTY; - let mut vtl2_chipset_mmio = MemoryRange::EMPTY; let mut builder = LayoutBuilder::new(); @@ -162,6 +145,7 @@ pub(super) fn resolve_memory_layout( builder.fixed("chipset-low-mmio", chipset_low_mmio); // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. + let mut chipset_high_mmio = MemoryRange::EMPTY; if input.chipset_high_mmio_size != 0 { builder.request( "chipset-high-mmio", @@ -247,6 +231,7 @@ pub(super) fn resolve_memory_layout( // Virtio-mmio: allocate one contiguous region for all slots. Each slot is // 4 KiB, so the region is `count * 4 KiB` placed as a single Mmio32 // request. 
+ let mut virtio_mmio_region = MemoryRange::EMPTY; if input.virtio_mmio_count > 0 { builder.request( "virtio-mmio", @@ -274,6 +259,7 @@ pub(super) fn resolve_memory_layout( // VTL2 chipset MMIO is implementation-private — placed after all // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. + let mut vtl2_chipset_mmio = MemoryRange::EMPTY; if input.vtl2_chipset_mmio_size != 0 { builder.request( "vtl2-chipset-mmio", @@ -293,6 +279,7 @@ pub(super) fn resolve_memory_layout( // overconstraining and would lead to holes in the VTL0 layout--we just // don't support IGVM files with relocation sections that cannot be // satisfied by the post-MMIO space. + let mut vtl2_range = MemoryRange::EMPTY; if let Some(vtl2_layout) = input.vtl2_layout { builder.request( "vtl2", @@ -367,11 +354,8 @@ pub(super) fn resolve_memory_layout( // The chipset low MMIO range is always present (at least the // architectural reserved zone) and is always reported. Hiding it would // leave a real allocated hole in the layout invisible to consumers. 
- let mut mmio_gaps: Vec = vec![chipset_low_mmio]; - if input.chipset_high_mmio_size != 0 { - mmio_gaps.push(chipset_high_mmio); - } - if input.vtl2_chipset_mmio_size != 0 { + let mut mmio_gaps: Vec = vec![chipset_low_mmio, chipset_high_mmio]; + if !vtl2_chipset_mmio.is_empty() { mmio_gaps.push(vtl2_chipset_mmio); } @@ -410,19 +394,13 @@ pub(super) fn resolve_memory_layout( ); } - let virtio_mmio_region = if input.virtio_mmio_count > 0 { - Some(virtio_mmio_region) - } else { - None - }; - Ok(ResolvedMemoryLayout { memory_layout, pcie_root_complex_ranges, virtio_mmio_region, - chipset_low_mmio: Some(chipset_low_mmio), - chipset_high_mmio: (input.chipset_high_mmio_size != 0).then_some(chipset_high_mmio), - vtl2_chipset_mmio: (input.vtl2_chipset_mmio_size != 0).then_some(vtl2_chipset_mmio), + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, }) } @@ -596,12 +574,8 @@ mod tests { fn chipset_mmio_is_resolved() { let result = resolve_memory_layout(input(2 * GB, None, None)).unwrap(); - let low = result - .chipset_low_mmio - .expect("should have low chipset MMIO"); - let high = result - .chipset_high_mmio - .expect("should have high chipset MMIO"); + let low = result.chipset_low_mmio; + let high = result.chipset_high_mmio; assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE); assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); // Chipset low MMIO is pinned to end at 4 GiB and must fully contain @@ -788,9 +762,7 @@ mod tests { let result = resolve_memory_layout(config).unwrap(); - let region = result - .virtio_mmio_region - .expect("should have virtio-mmio region"); + let region = result.virtio_mmio_region; assert_eq!(region.len(), 3 * PAGE_SIZE); assert!(region.end() <= 4 * GB, "virtio-mmio should be below 4 GB"); } @@ -811,7 +783,7 @@ mod tests { let result = resolve_memory_layout(config).unwrap(); - assert!(result.virtio_mmio_region.is_none()); + assert!(result.virtio_mmio_region.is_empty()); } #[test] @@ -821,14 +793,10 @@ mod tests { let result = 
resolve_memory_layout(config).unwrap(); - let vtl2_mmio = result - .vtl2_chipset_mmio - .expect("should have VTL2 chipset MMIO"); + let vtl2_mmio = result.vtl2_chipset_mmio; assert_eq!(vtl2_mmio.len(), DEFAULT_VTL2_CHIPSET_MMIO_SIZE); // VTL2 chipset MMIO should be after all VTL0-visible ranges. - let chipset_high = result - .chipset_high_mmio - .expect("should have high chipset MMIO"); + let chipset_high = result.chipset_high_mmio; assert!( vtl2_mmio.start() >= chipset_high.end(), "VTL2 chipset MMIO should be after VTL0 high MMIO" @@ -858,12 +826,10 @@ mod tests { let result = resolve_memory_layout(config).unwrap(); - let low = result - .chipset_low_mmio - .expect("low chipset MMIO is always present (arch reserved zone)"); + let low = result.chipset_low_mmio; assert_eq!(low.end(), 4 * GB); assert!(low.contains(&ARCH_RESERVED)); - assert!(result.chipset_high_mmio.is_none()); + assert!(result.chipset_high_mmio.is_empty()); // The reported range must appear in MemoryLayout::mmio() so that // RAM is not silently placed around an invisible hole. 
assert_eq!(result.memory_layout.mmio(), &[low]); diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index d4a166fee2..2203b88180 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -91,8 +91,6 @@ pub enum Error { LowerVtlContext, #[error("missing required memory range {0}")] MissingRequiredMemory(MemoryRange), - #[error("IGVM file requires at least two mmio ranges")] - UnsupportedMmio, } fn from_memory_range(range: &MemoryRange) -> IGVM_VHS_MEMORY_RANGE { @@ -265,9 +263,9 @@ struct BuildDeviceTreeParams<'a> { with_vmbus_redirect: bool, com_serial: Option, entropy: Option<&'a [u8]>, - chipset_low_mmio: Option, - chipset_high_mmio: Option, - vtl2_chipset_mmio: Option, + chipset_low_mmio: MemoryRange, + chipset_high_mmio: MemoryRange, + vtl2_chipset_mmio: MemoryRange, } /// Build a device tree representing the whole guest partition. @@ -360,15 +358,20 @@ fn build_device_tree(params: BuildDeviceTreeParams<'_>) -> Result, fdt:: // Build DT ranges for VMBus devices. VTL0 gets the chipset low/high MMIO // ranges; VTL2 gets its own private chipset MMIO range. let ranges_vtl0: Vec = [chipset_low_mmio, chipset_high_mmio] - .into_iter() - .flatten() - .flat_map(|range| [range.start(), range.start(), range.len()]) - .collect(); - let ranges_vtl2: Vec = vtl2_chipset_mmio .into_iter() .flat_map(|range| [range.start(), range.start(), range.len()]) .collect(); + let ranges_vtl2: Vec = if vtl2_chipset_mmio.is_empty() { + vec![] + } else { + vec![ + vtl2_chipset_mmio.start(), + vtl2_chipset_mmio.start(), + vtl2_chipset_mmio.len(), + ] + }; + // VTL0 vmbus root device let vmbus_vtl0_name = if ranges_vtl0.is_empty() { "vmbus-vtl0".into() @@ -521,11 +524,11 @@ pub struct LoadIgvmParams<'a, T: ArchTopology> { /// Entropy pub entropy: Option<&'a [u8]>, /// VTL0 chipset low MMIO range for the device tree VMBus node. 
- pub chipset_low_mmio: Option, + pub chipset_low_mmio: MemoryRange, /// VTL0 chipset high MMIO range for the device tree VMBus node. - pub chipset_high_mmio: Option, + pub chipset_high_mmio: MemoryRange, /// VTL2-private chipset MMIO range for the device tree VTL2 VMBus node. - pub vtl2_chipset_mmio: Option, + pub vtl2_chipset_mmio: MemoryRange, } pub fn load_igvm( @@ -954,10 +957,11 @@ fn load_igvm_x86( } IgvmDirectiveHeader::MmioRanges(ref info) => { // Convert the chipset MMIO ranges to the IGVM format. - let low = chipset_low_mmio.ok_or(Error::UnsupportedMmio)?; - let high = chipset_high_mmio.ok_or(Error::UnsupportedMmio)?; let mmio_ranges = IGVM_VHS_MMIO_RANGES { - mmio_ranges: [from_memory_range(&low), from_memory_range(&high)], + mmio_ranges: [ + from_memory_range(&chipset_low_mmio), + from_memory_range(&chipset_high_mmio), + ], }; import_parameter(&mut parameter_areas, info, mmio_ranges.as_bytes())?; } From 728e56ee77a4f0993bd4ef6f28e2f9c63be504f2 Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 18 May 2026 11:14:03 -0700 Subject: [PATCH 33/36] cleanup --- openvmm/openvmm_core/src/worker/dispatch.rs | 2 +- .../openvmm_core/src/worker/memory_layout.rs | 70 +++++++++---------- openvmm/openvmm_defs/src/config.rs | 2 +- vm/vmcore/vm_topology/src/memory.rs | 22 ++++-- vmm_core/vm_manifest_builder/src/lib.rs | 2 +- 5 files changed, 53 insertions(+), 45 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 94605a230c..4372451446 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -252,7 +252,7 @@ pub struct Manifest { chipset_devices: Vec, pci_chipset_devices: Vec, chipset_capabilities: VmChipsetCapabilities, - chipset_low_mmio_size: u64, + chipset_low_mmio_size: u32, chipset_high_mmio_size: u64, vtl2_chipset_mmio_size: u64, generation_id_recv: Option>, diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs 
b/openvmm/openvmm_core/src/worker/memory_layout.rs index b90d446ee1..d8faebf076 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -49,17 +49,17 @@ pub(super) struct ResolvedMemoryLayout { pub memory_layout: MemoryLayout, pub pcie_root_complex_ranges: Vec, /// Contiguous MMIO region for all virtio-mmio device slots. Each slot is - /// 4 KiB, indexed from the start of the region. `None` when no + /// 4 KiB, indexed from the start of the region. `EMPTY` when no /// virtio-mmio devices are configured. pub virtio_mmio_region: MemoryRange, - /// Chipset low MMIO range (below 4 GB) for VMOD/PCI0 _CRS. `None` when - /// no VMBus / chipset MMIO is configured. + /// Chipset low MMIO range (below 4 GB) for VMOD/PCI0 _CRS. Always at + /// least the architectural reserved zone (LAPIC, IOAPIC, TPM, ...). pub chipset_low_mmio: MemoryRange, - /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. `None` when - /// no VMBus / chipset MMIO is configured. + /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. `EMPTY` when + /// no chipset high MMIO is configured. pub chipset_high_mmio: MemoryRange, /// VTL2-private chipset MMIO range, reported to VTL2 VMBus via the device - /// tree. `None` when VTL2 is not configured or has no chipset MMIO. + /// tree. `EMPTY` when VTL2 is not configured or has no chipset MMIO. pub vtl2_chipset_mmio: MemoryRange, } @@ -79,7 +79,7 @@ pub(super) struct MemoryLayoutInput<'a> { /// Chipset low MMIO size (below 4 GB). This is the VMOD/PCI0 _CRS range /// for VMBus devices and PIIX4 PCI BARs. The address is always allocated /// dynamically. `0` disables the range. - pub chipset_low_mmio_size: u64, + pub chipset_low_mmio_size: u32, /// Chipset high MMIO size (above RAM). This is the VMOD/PCI0 _CRS high /// range for VMBus devices. The address is always allocated dynamically. /// `0` disables the range. 
@@ -137,10 +137,9 @@ pub(super) fn resolve_memory_layout( ARCH_RESERVED_AARCH64 }; let four_gb = 4 * GB; - let low_mmio_size = input.chipset_low_mmio_size.max(arch_reserved.len()); - if low_mmio_size > four_gb { - bail!("chipset low MMIO size {low_mmio_size:#x} exceeds 4 GiB"); - } + let low_mmio_size = u64::from(input.chipset_low_mmio_size) + .next_multiple_of(0x1000) + .max(arch_reserved.len()); let chipset_low_mmio = MemoryRange::new(four_gb - low_mmio_size..four_gb); builder.fixed("chipset-low-mmio", chipset_low_mmio); @@ -343,17 +342,12 @@ pub(super) fn resolve_memory_layout( let vtl2_range = input.vtl2_layout.map(|_| vtl2_range); - // `MemoryLayout::mmio()` is a legacy positional contract preserved here - // exactly as callers had it pre-allocator: `[0]` = chipset low MMIO, - // `[1]` = chipset high MMIO, and (when VTL2 is enabled) `[2]` = the - // VTL2-private chipset MMIO range. Consumers (DSDT, Linux DT, UEFI, - // PCAT) rely on this ordering. The virtio-mmio region was never part of - // this vector and remains tracked separately. `MemoryLayout::mmio()` will - // eventually be removed. - // - // The chipset low MMIO range is always present (at least the - // architectural reserved zone) and is always reported. Hiding it would - // leave a real allocated hole in the layout invisible to consumers. + // `MemoryLayout::mmio()` is a positional contract: `[0]` = chipset low + // MMIO, `[1]` = chipset high MMIO, and (when VTL2 is enabled) `[2]` = + // the VTL2-private chipset MMIO range. Consumers (DSDT, Linux DT, UEFI, + // PCAT) rely on this ordering. Entries may be `MemoryRange::EMPTY` when + // the corresponding range is not configured; the positional index is + // what matters, not the presence of a non-empty range. 
let mut mmio_gaps: Vec = vec![chipset_low_mmio, chipset_high_mmio]; if !vtl2_chipset_mmio.is_empty() { mmio_gaps.push(vtl2_chipset_mmio); @@ -474,7 +468,7 @@ mod tests { const MB: u64 = 1024 * 1024; // Match the production defaults from `vm_manifest_builder`. #[cfg(guest_arch = "x86_64")] - const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u64 = 128 * 1024 * 1024; + const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u32 = 128 * 1024 * 1024; #[cfg(guest_arch = "aarch64")] const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u64 = 512 * 1024 * 1024; #[cfg(guest_arch = "x86_64")] @@ -576,7 +570,7 @@ mod tests { let low = result.chipset_low_mmio; let high = result.chipset_high_mmio; - assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE); + assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE as u64); assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); // Chipset low MMIO is pinned to end at 4 GiB and must fully contain // the architectural reserved zone (LAPIC, IOAPIC, TPM, ...). @@ -830,28 +824,28 @@ mod tests { assert_eq!(low.end(), 4 * GB); assert!(low.contains(&ARCH_RESERVED)); assert!(result.chipset_high_mmio.is_empty()); - // The reported range must appear in MemoryLayout::mmio() so that - // RAM is not silently placed around an invisible hole. - assert_eq!(result.memory_layout.mmio(), &[low]); + // The reported ranges must appear in MemoryLayout::mmio() preserving + // the positional contract: [0] = low, [1] = high (EMPTY placeholder). + assert_eq!(result.memory_layout.mmio(), &[low, MemoryRange::EMPTY]); } #[test] - fn asymmetric_chipset_mmio_is_rejected() { + fn asymmetric_chipset_mmio_is_accepted() { + // Asymmetric chipset MMIO (only low or only high) is allowed. + // A missing high range is EMPTY; low always covers the arch reserved zone.
let mut config = input(2 * GB, None, None); config.chipset_high_mmio_size = 0; - let err = resolve_memory_layout(config).unwrap_err(); - assert!( - err.to_string().contains("both enabled or both disabled"), - "unexpected error: {err}" - ); + let result = resolve_memory_layout(config).unwrap(); + assert!(!result.chipset_low_mmio.is_empty()); + assert!(result.chipset_high_mmio.is_empty()); let mut config = input(2 * GB, None, None); config.chipset_low_mmio_size = 0; - let err = resolve_memory_layout(config).unwrap_err(); - assert!( - err.to_string().contains("both enabled or both disabled"), - "unexpected error: {err}" - ); + let result = resolve_memory_layout(config).unwrap(); + // Low is always at least the arch reserved zone. + assert!(!result.chipset_low_mmio.is_empty()); + // High is still configured in this case. + assert!(!result.chipset_high_mmio.is_empty()); } #[test] diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 93c1d44636..2fb6193c6e 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -57,7 +57,7 @@ pub struct Config { pub chipset_capabilities: VmChipsetCapabilities, /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. /// The address is always allocated dynamically. `0` disables the range. - pub chipset_low_mmio_size: u64, + pub chipset_low_mmio_size: u32, /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. /// The address is always allocated dynamically. `0` disables the range. pub chipset_high_mmio_size: u64, diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index 2de1998f87..c0a898620e 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -271,9 +271,12 @@ impl MemoryLayout { /// Makes a new memory layout from already-resolved RAM and fixed ranges. 
/// - /// The RAM, MMIO, PCI ECAM, and PCI MMIO ranges must each be in sorted - /// order, non-empty, and non-overlapping. The combined layout is also - /// validated for overlaps, including the optional VTL2 range. + /// The RAM, PCI ECAM, and PCI MMIO ranges must each be in sorted order, + /// non-empty, and non-overlapping. MMIO gaps may contain empty placeholder + /// ranges to preserve positional indexing (e.g. `mmio()[0]` = low, + /// `mmio()[1]` = high); empty entries are ignored during validation. + /// The combined layout is also validated for overlaps, including the + /// optional VTL2 range. pub fn new_from_resolved_ranges( ram: Vec, mmio_gaps: Vec, @@ -282,7 +285,14 @@ impl MemoryLayout { vtl2_range: Option, ) -> Result { validate_ranges_with_metadata(&ram)?; - validate_ranges(&mmio_gaps)?; + // MMIO gaps may include empty placeholders for positional indexing; + // validate only the non-empty entries. + let non_empty_mmio: Vec<_> = mmio_gaps + .iter() + .copied() + .filter(|r| !r.is_empty()) + .collect(); + validate_ranges(&non_empty_mmio)?; validate_ranges(&pci_ecam_gaps)?; validate_ranges(&pci_mmio_gaps)?; @@ -299,6 +309,9 @@ impl MemoryLayout { pci_mmio: Vec, vtl2_range: Option, ) -> Result { + // Filter out empty placeholder ranges before validation and overlap + // checks — they carry no physical meaning and exist only for + // positional indexing in the stored mmio vector. let mut all_ranges = ram .iter() .map(|x| &x.range) @@ -307,6 +320,7 @@ impl MemoryLayout { .chain(&pci_ecam) .chain(&pci_mmio) .copied() + .filter(|r| !r.is_empty()) .collect::>(); all_ranges.sort(); diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index e12b0ba322..09df814346 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -105,7 +105,7 @@ pub struct VmChipsetResult { /// Default chipset low MMIO size (below 4 GiB) for VMOD/PCI0 _CRS. 
/// The address is always allocated dynamically. `0` when the VM type /// has no VMBus or PCI bus. - pub chipset_low_mmio_size: u64, + pub chipset_low_mmio_size: u32, /// Default chipset high MMIO size (above RAM) for VMOD/PCI0 _CRS. /// The address is always allocated dynamically. `0` when the VM type /// has no VMBus or PCI bus. From 2746bcec148566647ece34eb7cc90d26a25b7c18 Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 18 May 2026 12:00:32 -0700 Subject: [PATCH 34/36] feedback --- Cargo.lock | 1 + .../architecture/openvmm/memory-layout.md | 36 +++++++++---- openhcl/underhill_core/src/worker.rs | 1 - openvmm/openvmm_core/src/worker/dispatch.rs | 22 ++++---- .../openvmm_core/src/worker/memory_layout.rs | 2 +- openvmm/openvmm_defs/src/config.rs | 12 ++--- openvmm/openvmm_entry/src/lib.rs | 8 +-- openvmm/openvmm_entry/src/ttrpc/mod.rs | 14 ++--- petri/src/vm/openvmm/construct.rs | 8 +-- vm/vmcore/vm_topology/src/memory.rs | 13 ++--- vmm_core/vm_manifest_builder/Cargo.toml | 1 + vmm_core/vm_manifest_builder/src/lib.rs | 52 ++++++++----------- vmm_core/vmm_core_defs/src/lib.rs | 19 +++++++ 13 files changed, 99 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4d7817b6ac..fd7a72c74b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9513,6 +9513,7 @@ dependencies = [ "serial_pl011_resources", "thiserror 2.0.16", "vm_resource", + "vmm_core_defs", "vmotherboard", ] diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index 666361bd85..2092f4822e 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -44,10 +44,10 @@ accepts four kinds of input: | Input | Purpose | |---|---| -| `reserve(tag, range)` | Block allocation at this address but do not include it in the layout top. | -| `fixed(tag, range)` | A range whose address is already decided. 
Blocks allocation and counts as part of the layout. | -| `ram(tag, target, size, alignment)` | Ordinary guest RAM. The only request type that may be split across multiple extents. | -| `request(tag, target, size, alignment, placement)` | A single contiguous range, placed dynamically. The `placement` chooses one of three phases below. | +| `reserve(range)` | Block allocation at this address but do not include it in the layout top. | +| `fixed(range)` | A range whose address is already decided. Blocks allocation and counts as part of the layout. | +| `ram(size, alignment)` | Ordinary guest RAM. The only request type that may be split across multiple extents. | +| `request(size, alignment, placement)` | A single contiguous range, placed dynamically. The `placement` chooses one of three phases below. | `reserve` and `fixed` differ only in how they affect the **layout top** — the address one past the highest guest-visible byte. `fixed` ranges raise @@ -135,8 +135,12 @@ issues requests in this order: | < 1 GB | 2 MB | | ≥ 1 GB | 1 GB | - Sub-GB nodes use 2 MB so small NUMA nodes do not waste a full GB of - address space. + Alignment matters because RAM extents that start on a huge-page + boundary can be mapped with 2 MB or 1 GB huge pages in host and + guest page tables, avoiding the memory overhead and construction + cost of thousands of smaller page table entries and reducing TLB + pressure at runtime. Sub-GB nodes use 2 MB so small NUMA nodes + do not waste a full GB of address space. 7. **VTL2 chipset MMIO** (`PostMmio`) — VTL2's own VMBus / chipset MMIO region, when VTL2 is configured. Placed after VTL0 so enabling VTL2 does not move any VTL0 address. @@ -150,14 +154,24 @@ issues requests in this order: After `allocate()` succeeds, the worker collects the resolved ranges into the `MemoryLayout`'s MMIO, PCI ECAM, and PCI MMIO gap vectors, then checks -`MemoryLayout::end_of_layout()` against the host's physical-address width. 
+the highest placed-range address (which includes VTL2 private memory and +VTL2 chipset MMIO) against the host's physical-address width. ## RAM splitting -RAM is the only splittable request, and the splitter has one rule worth -calling out: the alignment passed in is also the **split granularity**. -When a single free range cannot hold the entire request, the part placed -in that range is rounded down to the request alignment before continuing. +RAM is the only splittable request. When contiguous free space is +available, the full requested size is placed at an aligned start address +— alignment constrains where the extent starts, not how large it is. A +1.5 GB request with 1 GB alignment in open space produces a single +`[0, 1.5 GB)` extent with no wasted space. + +Splitting only happens when a fixed or reserved range interrupts the free +space. In that case the alignment also acts as the **split granularity**: +partial chunks are rounded down to the alignment before continuing. This +keeps every RAM extent on a huge-page boundary so the host and guest can +use large pages (reducing page table overhead and TLB pressure), and +avoids sub-alignment fragments that would complicate the NUMA and +compatibility surface. The practical effect is that 1 GB-aligned RAM stays in 1 GB-aligned chunks. A small fixed hole just above the 1 GB boundary will not cause a diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index a9b763d73d..9eb32a722c 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -2384,7 +2384,6 @@ async fn new_underhill_vm( mut chipset_devices, pci_chipset_devices, capabilities, - .. 
} = chipset .build() .context("failed to build chipset configuration")?; diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 4372451446..1953cdc677 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -200,9 +200,7 @@ impl Manifest { chipset_devices: config.chipset_devices, pci_chipset_devices: config.pci_chipset_devices, chipset_capabilities: config.chipset_capabilities, - chipset_low_mmio_size: config.chipset_low_mmio_size, - chipset_high_mmio_size: config.chipset_high_mmio_size, - vtl2_chipset_mmio_size: config.vtl2_chipset_mmio_size, + layout: config.layout, generation_id_recv: config.generation_id_recv, rtc_delta_milliseconds: config.rtc_delta_milliseconds, automatic_guest_reset: config.automatic_guest_reset, @@ -252,9 +250,7 @@ pub struct Manifest { chipset_devices: Vec, pci_chipset_devices: Vec, chipset_capabilities: VmChipsetCapabilities, - chipset_low_mmio_size: u32, - chipset_high_mmio_size: u64, - vtl2_chipset_mmio_size: u64, + layout: vmm_core_defs::LayoutConfig, generation_id_recv: Option>, rtc_delta_milliseconds: i64, automatic_guest_reset: bool, @@ -927,9 +923,9 @@ impl InitializedVm { let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), - chipset_low_mmio_size: cfg.chipset_low_mmio_size, - chipset_high_mmio_size: cfg.chipset_high_mmio_size, - vtl2_chipset_mmio_size: cfg.vtl2_chipset_mmio_size, + chipset_low_mmio_size: cfg.layout.chipset_low_mmio_size, + chipset_high_mmio_size: cfg.layout.chipset_high_mmio_size, + vtl2_chipset_mmio_size: cfg.layout.vtl2_chipset_mmio_size, pcie_root_complexes: &cfg.pcie_root_complexes, virtio_mmio_count, vtl2_layout, @@ -3375,9 +3371,11 @@ impl LoadedVm { chipset_devices: vec![], // TODO pci_chipset_devices: vec![], // TODO chipset_capabilities: self.inner.chipset_capabilities, - chipset_low_mmio_size: 0, // 
TODO - chipset_high_mmio_size: 0, // TODO - vtl2_chipset_mmio_size: 0, // TODO + layout: vmm_core_defs::LayoutConfig { + chipset_low_mmio_size: 0, + chipset_high_mmio_size: 0, + vtl2_chipset_mmio_size: 0, + }, // TODO generation_id_recv: None, // TODO rtc_delta_milliseconds: 0, // TODO automatic_guest_reset: self.inner.automatic_guest_reset, diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index d8faebf076..ec9e85766b 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -470,7 +470,7 @@ mod tests { #[cfg(guest_arch = "x86_64")] const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u32 = 128 * 1024 * 1024; #[cfg(guest_arch = "aarch64")] - const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u64 = 512 * 1024 * 1024; + const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u32 = 512 * 1024 * 1024; #[cfg(guest_arch = "x86_64")] const ARCH_RESERVED: MemoryRange = ARCH_RESERVED_X86_64; #[cfg(guest_arch = "aarch64")] diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 2fb6193c6e..4e4af4dd4b 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -55,15 +55,9 @@ pub struct Config { pub chipset_devices: Vec, pub pci_chipset_devices: Vec, pub chipset_capabilities: VmChipsetCapabilities, - /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `0` disables the range. - pub chipset_low_mmio_size: u32, - /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `0` disables the range. - pub chipset_high_mmio_size: u64, - /// VTL2-private chipset MMIO range size for VTL2 VMBus. - /// The address is always allocated dynamically. `0` disables the range. - pub vtl2_chipset_mmio_size: u64, + /// Memory layout sizing for the layout engine. 
Determines chipset MMIO + /// range sizes; addresses are allocated dynamically by the resolver. + pub layout: vmm_core_defs::LayoutConfig, pub generation_id_recv: Option>, // This is used for testing. TODO: resourcify, and also store this in VMGS. pub rtc_delta_milliseconds: i64, diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index eaa7e24ddf..6afbc14913 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -872,14 +872,12 @@ async fn vm_config_from_command_line( // TODO: load from VMGS file if it exists let bios_guid = Guid::new_random(); + let layout_config = chipset.layout_config(); let VmChipsetResult { chipset, mut chipset_devices, pci_chipset_devices, capabilities, - chipset_low_mmio_size, - chipset_high_mmio_size, - vtl2_chipset_mmio_size, } = chipset .build() .context("failed to build chipset configuration")?; @@ -1634,9 +1632,7 @@ async fn vm_config_from_command_line( chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, - chipset_low_mmio_size, - chipset_high_mmio_size, - vtl2_chipset_mmio_size, + layout: layout_config, #[cfg(windows)] vpci_resources, vmgs, diff --git a/openvmm/openvmm_entry/src/ttrpc/mod.rs b/openvmm/openvmm_entry/src/ttrpc/mod.rs index 1b097b900b..8fc1eb398e 100644 --- a/openvmm/openvmm_entry/src/ttrpc/mod.rs +++ b/openvmm/openvmm_entry/src/ttrpc/mod.rs @@ -539,13 +539,15 @@ impl VmService { })?); } - let chipset = VmManifestBuilder::new( + let chipset_builder = VmManifestBuilder::new( vm_manifest_builder::BaseChipsetType::HyperVGen2LinuxDirect, vm_manifest_builder::MachineArch::X86_64, ) - .with_serial(ports) - .build() - .context("failed to build vm configuration")?; + .with_serial(ports); + let layout_config = chipset_builder.layout_config(); + let chipset = chipset_builder + .build() + .context("failed to build vm configuration")?; // Extract memory and processor counts for the VmController. 
let config_mem_size = req_config @@ -610,9 +612,7 @@ impl VmService { chipset_devices: chipset.chipset_devices, pci_chipset_devices: chipset.pci_chipset_devices, chipset_capabilities: chipset.capabilities, - chipset_low_mmio_size: chipset.chipset_low_mmio_size, - chipset_high_mmio_size: chipset.chipset_high_mmio_size, - vtl2_chipset_mmio_size: chipset.vtl2_chipset_mmio_size, + layout: layout_config, generation_id_recv: None, rtc_delta_milliseconds: 0, automatic_guest_reset: true, diff --git a/petri/src/vm/openvmm/construct.rs b/petri/src/vm/openvmm/construct.rs index 5b1a23c4a4..44e6f3dade 100644 --- a/petri/src/vm/openvmm/construct.rs +++ b/petri/src/vm/openvmm/construct.rs @@ -348,6 +348,7 @@ impl PetriVmConfigOpenVmm { let mut vsock_listener = Some(vsock_listener); let vsock_path_string = vsock_path.to_string_lossy(); + let layout_config = chipset.layout_config(); let chipset = chipset .build() .context("failed to build chipset configuration")?; @@ -459,9 +460,6 @@ impl PetriVmConfigOpenVmm { mut chipset_devices, pci_chipset_devices, capabilities, - chipset_low_mmio_size, - chipset_high_mmio_size, - vtl2_chipset_mmio_size, } = chipset; // Add the TPM @@ -499,9 +497,7 @@ impl PetriVmConfigOpenVmm { chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, - chipset_low_mmio_size, - chipset_high_mmio_size, - vtl2_chipset_mmio_size, + layout: layout_config, // Basic virtualization device support hypervisor: HypervisorConfig { diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index c0a898620e..2d67c9ef1b 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -271,12 +271,13 @@ impl MemoryLayout { /// Makes a new memory layout from already-resolved RAM and fixed ranges. /// - /// The RAM, PCI ECAM, and PCI MMIO ranges must each be in sorted order, - /// non-empty, and non-overlapping. MMIO gaps may contain empty placeholder - /// ranges to preserve positional indexing (e.g. 
`mmio()[0]` = low, - /// `mmio()[1]` = high); empty entries are ignored during validation. - /// The combined layout is also validated for overlaps, including the - /// optional VTL2 range. + /// Each individual range must be non-empty, but the lists themselves may + /// be empty (e.g. no PCIe root complexes means empty PCI ECAM/MMIO + /// vectors). Ranges within each list must be sorted and non-overlapping. + /// MMIO gaps may contain empty placeholder ranges to preserve positional + /// indexing (e.g. `mmio()[0]` = low, `mmio()[1]` = high); empty entries + /// are ignored during validation. The combined layout is also validated + /// for overlaps, including the optional VTL2 range. pub fn new_from_resolved_ranges( ram: Vec, mmio_gaps: Vec, diff --git a/vmm_core/vm_manifest_builder/Cargo.toml b/vmm_core/vm_manifest_builder/Cargo.toml index b89fa5d683..a040d3a9b0 100644 --- a/vmm_core/vm_manifest_builder/Cargo.toml +++ b/vmm_core/vm_manifest_builder/Cargo.toml @@ -15,6 +15,7 @@ serial_core.workspace = true serial_debugcon_resources.workspace = true serial_pl011_resources.workspace = true vm_resource.workspace = true +vmm_core_defs.workspace = true vmotherboard.workspace = true mesh.workspace = true diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index 09df814346..f63c624c63 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -43,6 +43,7 @@ use vm_resource::PlatformResource; use vm_resource::Resource; use vm_resource::ResourceId; use vm_resource::kind::SerialBackendHandle; +pub use vmm_core_defs::LayoutConfig; use vmotherboard::ChipsetDeviceHandle; use vmotherboard::LegacyPciChipsetDeviceHandle; use vmotherboard::options::BaseChipsetManifest; @@ -102,18 +103,6 @@ pub struct VmChipsetResult { pub pci_chipset_devices: Vec, /// Derived chipset capabilities needed by firmware and table generation. 
pub capabilities: VmChipsetCapabilities, - /// Default chipset low MMIO size (below 4 GiB) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `0` when the VM type - /// has no VMBus or PCI bus. - pub chipset_low_mmio_size: u32, - /// Default chipset high MMIO size (above RAM) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `0` when the VM type - /// has no VMBus or PCI bus. - pub chipset_high_mmio_size: u64, - /// Default VTL2-private chipset MMIO size for VTL2 VMBus. - /// The address is always allocated dynamically. `0` when the VM type - /// does not include VTL2. - pub vtl2_chipset_mmio_size: u64, } /// Error type for building a VM manifest. @@ -248,9 +237,6 @@ impl VmManifestBuilder { with_psp: false, with_guest_watchdog: false, }, - chipset_low_mmio_size: 0, - chipset_high_mmio_size: 0, - vtl2_chipset_mmio_size: 0, }; if let Some((backend, port)) = self.debugcon { @@ -405,11 +391,16 @@ impl VmManifestBuilder { } } - // Chipset MMIO sizing: all VM types with VMBus or a PCI bus get low + - // high chipset MMIO. HclHost additionally gets VTL2 chipset MMIO. - // - // Low MMIO is a single window pinned to end at 4 GiB and advertised - // as `\_SB.VMOD._CRS`. The defaults match the legacy Hyper-V sizes. + Ok(result) + } + + /// Returns the default memory layout sizing for this VM type and + /// architecture. + /// + /// This is separate from [`Self::build`] because not every consumer runs + /// the layout engine. In particular, OpenHCL (Underhill) receives its + /// memory layout from the host and does not use these defaults. 
+ pub fn layout_config(&self) -> LayoutConfig { let default_low = match self.arch { MachineArch::X86_64 => 128 * 1024 * 1024, MachineArch::Aarch64 => 512 * 1024 * 1024, @@ -420,18 +411,17 @@ impl VmManifestBuilder { BaseChipsetType::HypervGen1 | BaseChipsetType::HypervGen2Uefi | BaseChipsetType::HyperVGen2LinuxDirect - | BaseChipsetType::UnenlightenedLinuxDirect => { - result.chipset_low_mmio_size = default_low; - result.chipset_high_mmio_size = default_high; - } - BaseChipsetType::HclHost => { - result.chipset_low_mmio_size = default_low; - result.chipset_high_mmio_size = default_high; - result.vtl2_chipset_mmio_size = default_vtl2; - } + | BaseChipsetType::UnenlightenedLinuxDirect => LayoutConfig { + chipset_low_mmio_size: default_low, + chipset_high_mmio_size: default_high, + vtl2_chipset_mmio_size: 0, + }, + BaseChipsetType::HclHost => LayoutConfig { + chipset_low_mmio_size: default_low, + chipset_high_mmio_size: default_high, + vtl2_chipset_mmio_size: default_vtl2, + }, } - - Ok(result) } } diff --git a/vmm_core/vmm_core_defs/src/lib.rs b/vmm_core/vmm_core_defs/src/lib.rs index 221cbe554c..a12211d4bd 100644 --- a/vmm_core/vmm_core_defs/src/lib.rs +++ b/vmm_core/vmm_core_defs/src/lib.rs @@ -9,7 +9,26 @@ pub mod debug_rpc; use inspect::Inspect; +use mesh::MeshPayload; use mesh::payload::Protobuf; + +/// Default memory layout sizing for a VM, used by the layout engine in +/// `openvmm_core::worker::memory_layout`. +/// +/// Consumers that receive their memory layout from the host (such as OpenHCL / +/// Underhill) do not use these values. +#[derive(Debug, Clone, MeshPayload)] +pub struct LayoutConfig { + /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `0` disables the range. + pub chipset_low_mmio_size: u32, + /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `0` disables the range. 
+ pub chipset_high_mmio_size: u64, + /// VTL2-private chipset MMIO range size for VTL2 VMBus. + /// The address is always allocated dynamically. `0` disables the range. + pub vtl2_chipset_mmio_size: u64, +} use std::sync::Arc; /// HaltReason sent by devices and vp_set to the vmm. From b47993cb4429f0d3b08a4aa6f3ba54aac0fd5b99 Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 18 May 2026 12:11:21 -0700 Subject: [PATCH 35/36] feedback --- .../architecture/openvmm/memory-layout.md | 27 ++++++++++--------- vm/vmcore/vm_topology/src/memory.rs | 7 ++++- vmm_core/vmm_core_defs/src/lib.rs | 3 ++- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index 2092f4822e..2eafc82743 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -96,20 +96,21 @@ The worker resolver in [`openvmm_core::worker::memory_layout`](https://github.com/microsoft/openvmm/blob/main/openvmm/openvmm_core/src/worker/memory_layout.rs) issues requests in this order: -1. **Architectural reserved zone.** A `reserve` request for the - per-architecture range containing LAPIC, IOAPIC, GIC, PL011, battery, - TPM, and similar fixed-address platform devices. - - | Architecture | Range | +1. **Chipset low MMIO** (`fixed`) — a window pinned to end at 4 GiB, + advertised to firmware as `\_SB.VMOD._CRS`. The window always covers + at least the per-architecture reserved zone (LAPIC, IOAPIC, GIC, + PL011, battery, TPM, etc.) so guests can arbitrate fixed-address + children against this window. The caller-requested size may extend it + lower. + + | Architecture | Minimum range (architectural reserved zone) | |---|---| | x86_64 | `0xFE00_0000..0x1_0000_0000` | | aarch64 | `0xEF00_0000..0x1_0000_0000` | -2. 
**Chipset low MMIO** (`Mmio32`) — the VMOD/PCI0 `_CRS` low range for - VMBus relay devices and PIIX4 PCI BARs. 2 MB alignment. -3. **Chipset high MMIO** (`Mmio64`) — the corresponding high range. 2 MB +2. **Chipset high MMIO** (`Mmio64`) — the corresponding high range. 2 MB alignment. -4. **PCIe root complex ranges**, one per root complex: +3. **PCIe root complex ranges**, one per root complex: - **ECAM** (`Mmio32`). The size is derived from the bus window as `(end_bus - start_bus + 1) * 1 MB` (32 devices × 8 functions × 4 KiB per config space). @@ -120,9 +121,9 @@ issues requests in this order: fixed range as well. Per-BAR alignment would guarantee the entire window is usable for one large BAR, but burns address space on hosts with tight physical-address widths. -5. **Virtio-mmio slots** (`Mmio32`) — one contiguous region sized +4. **Virtio-mmio slots** (`Mmio32`) — one contiguous region sized `slot_count * 4 KiB`, when any slots are configured. -6. **RAM**, in vnode order. The first request becomes vnode 0, the second +5. **RAM**, in vnode order. The first request becomes vnode 0, the second vnode 1, and so on. Each vnode starts at or above the highest address used by prior vnodes; vnode N+1 never backfills a fragment that vnode N skipped. This keeps vnode ordering equal to address ordering and @@ -141,10 +142,10 @@ issues requests in this order: cost of thousands of smaller page table entries and reducing TLB pressure at runtime. Sub-GB nodes use 2 MB so small NUMA nodes do not waste a full GB of address space. -7. **VTL2 chipset MMIO** (`PostMmio`) — VTL2's own VMBus / chipset MMIO +6. **VTL2 chipset MMIO** (`PostMmio`) — VTL2's own VMBus / chipset MMIO region, when VTL2 is configured. Placed after VTL0 so enabling VTL2 does not move any VTL0 address. -8. **VTL2 private memory** (`PostMmio`) — when the IGVM file requests +7. 
**VTL2 private memory** (`PostMmio`) — when the IGVM file requests layout-mode VTL2 memory, the worker takes only its size and alignment from the IGVM relocation header. The IGVM file's relocation min/max bounds are not fed in as constraints here; they are validated later by diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index 2d67c9ef1b..bf0fe05ea6 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -433,7 +433,12 @@ impl MemoryLayout { /// One past the last byte of RAM, MMIO, PCI ECAM, or PCI MMIO. pub fn end_of_layout(&self) -> u64 { [ - self.mmio.last().map(|r| r.end()).unwrap_or(0), + self.mmio + .iter() + .filter(|r| !r.is_empty()) + .map(|r| r.end()) + .max() + .unwrap_or(0), self.end_of_ram(), self.pci_ecam.last().map(|r| r.end()).unwrap_or(0), self.pci_mmio.last().map(|r| r.end()).unwrap_or(0), diff --git a/vmm_core/vmm_core_defs/src/lib.rs b/vmm_core/vmm_core_defs/src/lib.rs index a12211d4bd..37211e4d76 100644 --- a/vmm_core/vmm_core_defs/src/lib.rs +++ b/vmm_core/vmm_core_defs/src/lib.rs @@ -20,7 +20,8 @@ use mesh::payload::Protobuf; #[derive(Debug, Clone, MeshPayload)] pub struct LayoutConfig { /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. - /// The address is always allocated dynamically. `0` disables the range. + /// The address is always allocated dynamically. `0` uses only the + /// architectural minimum (LAPIC, IOAPIC, GIC, etc.). pub chipset_low_mmio_size: u32, /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. /// The address is always allocated dynamically. `0` disables the range. 
From bcc672715e08506faed7d727235830099db95045 Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 18 May 2026 12:42:22 -0700 Subject: [PATCH 36/36] feedback --- .../architecture/openvmm/memory-layout.md | 6 ++- openvmm/openvmm_core/src/worker/dispatch.rs | 4 +- .../openvmm_core/src/worker/memory_layout.rs | 46 ++++++++----------- .../src/worker/vm_loaders/igvm.rs | 8 ++-- .../tests/x86_64/openhcl_linux_direct.rs | 5 +- 5 files changed, 31 insertions(+), 38 deletions(-) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md index 2eafc82743..947bfb66af 100644 --- a/Guide/src/reference/architecture/openvmm/memory-layout.md +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -256,8 +256,10 @@ identical and places VTL2 after the VTL0-visible top: |---|---| | VTL2 | `0xC000_0000..0xC020_0000` | -`MemoryLayout::end_of_layout()` reports the VTL0-visible top. -`MemoryLayout::vtl2_range()` reports the VTL2 range separately. +`MemoryLayout::end_of_layout()` reports the top of all stored ranges, +including VTL2 chipset MMIO when present. +`MemoryLayout::vtl2_range()` reports the VTL2 private memory range +separately. 
### Reserved holes do not raise the layout top diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 1953cdc677..d2c1185c8b 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -923,9 +923,7 @@ impl InitializedVm { let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), - chipset_low_mmio_size: cfg.layout.chipset_low_mmio_size, - chipset_high_mmio_size: cfg.layout.chipset_high_mmio_size, - vtl2_chipset_mmio_size: cfg.layout.vtl2_chipset_mmio_size, + layout: cfg.layout.clone(), pcie_root_complexes: &cfg.pcie_root_complexes, virtio_mmio_count, vtl2_layout, diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index ec9e85766b..3d42ca88d2 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -76,18 +76,8 @@ pub(super) struct MemoryLayoutInput<'a> { /// Optional per-vNUMA RAM budgets. When present, these must sum to /// `mem_size`, and request order is the vnode assignment order. pub numa_mem_sizes: Option<&'a [u64]>, - /// Chipset low MMIO size (below 4 GB). This is the VMOD/PCI0 _CRS range - /// for VMBus devices and PIIX4 PCI BARs. The address is always allocated - /// dynamically. `0` disables the range. - pub chipset_low_mmio_size: u32, - /// Chipset high MMIO size (above RAM). This is the VMOD/PCI0 _CRS high - /// range for VMBus devices. The address is always allocated dynamically. - /// `0` disables the range. - pub chipset_high_mmio_size: u64, - /// VTL2-private chipset MMIO size. Placed after all VTL0-visible layout - /// so enabling VTL2 does not move VTL0 addresses. The address is always - /// allocated dynamically. `0` disables the range. - pub vtl2_chipset_mmio_size: u64, + /// Chipset MMIO sizing from the manifest builder. 
+ pub layout: vmm_core_defs::LayoutConfig, /// PCIe root complex address-space intents. These are resolved by this /// worker step so front ends do not need to carve guest physical addresses. pub pcie_root_complexes: &'a [PcieRootComplexConfig], @@ -137,7 +127,7 @@ pub(super) fn resolve_memory_layout( ARCH_RESERVED_AARCH64 }; let four_gb = 4 * GB; - let low_mmio_size = u64::from(input.chipset_low_mmio_size) + let low_mmio_size = u64::from(input.layout.chipset_low_mmio_size) .next_multiple_of(0x1000) .max(arch_reserved.len()); let chipset_low_mmio = MemoryRange::new(four_gb - low_mmio_size..four_gb); @@ -145,11 +135,11 @@ pub(super) fn resolve_memory_layout( // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. let mut chipset_high_mmio = MemoryRange::EMPTY; - if input.chipset_high_mmio_size != 0 { + if input.layout.chipset_high_mmio_size != 0 { builder.request( "chipset-high-mmio", &mut chipset_high_mmio, - input.chipset_high_mmio_size, + input.layout.chipset_high_mmio_size, TWO_MB, Placement::Mmio64, ); @@ -259,11 +249,11 @@ pub(super) fn resolve_memory_layout( // VTL2 chipset MMIO is implementation-private — placed after all // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. 
let mut vtl2_chipset_mmio = MemoryRange::EMPTY; - if input.vtl2_chipset_mmio_size != 0 { + if input.layout.vtl2_chipset_mmio_size != 0 { builder.request( "vtl2-chipset-mmio", &mut vtl2_chipset_mmio, - input.vtl2_chipset_mmio_size, + input.layout.vtl2_chipset_mmio_size, TWO_MB, Placement::PostMmio, ); @@ -478,6 +468,12 @@ mod tests { const DEFAULT_CHIPSET_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; const DEFAULT_VTL2_CHIPSET_MMIO_SIZE: u64 = GB; + const DEFAULT_LAYOUT: vmm_core_defs::LayoutConfig = vmm_core_defs::LayoutConfig { + chipset_low_mmio_size: DEFAULT_CHIPSET_LOW_MMIO_SIZE, + chipset_high_mmio_size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, + vtl2_chipset_mmio_size: 0, + }; + fn input( mem_size: u64, numa_mem_sizes: Option<&[u64]>, @@ -486,9 +482,7 @@ mod tests { MemoryLayoutInput { mem_size, numa_mem_sizes, - chipset_low_mmio_size: DEFAULT_CHIPSET_LOW_MMIO_SIZE, - chipset_high_mmio_size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, - vtl2_chipset_mmio_size: 0, + layout: DEFAULT_LAYOUT, pcie_root_complexes: &[], virtio_mmio_count: 0, vtl2_layout, @@ -783,7 +777,7 @@ mod tests { #[test] fn vtl2_chipset_mmio_is_post_mmio() { let mut config = input(2 * GB, None, None); - config.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; + config.layout.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; let result = resolve_memory_layout(config).unwrap(); @@ -801,7 +795,7 @@ mod tests { fn vtl2_chipset_mmio_does_not_move_vtl0_layout() { let without = resolve(input(2 * GB, None, None)); let mut config = input(2 * GB, None, None); - config.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; + config.layout.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; let with = resolve_memory_layout(config).unwrap(); assert_eq!(with.memory_layout.ram(), without.ram()); @@ -815,8 +809,8 @@ mod tests { // reported so consumers see the same layout the allocator // produced. 
let mut config = input(2 * GB, None, None); - config.chipset_low_mmio_size = 0; - config.chipset_high_mmio_size = 0; + config.layout.chipset_low_mmio_size = 0; + config.layout.chipset_high_mmio_size = 0; let result = resolve_memory_layout(config).unwrap(); @@ -834,13 +828,13 @@ mod tests { // Asymmetric chipset MMIO (only low or only high) is allowed. // The missing range is EMPTY. let mut config = input(2 * GB, None, None); - config.chipset_high_mmio_size = 0; + config.layout.chipset_high_mmio_size = 0; let result = resolve_memory_layout(config).unwrap(); assert!(!result.chipset_low_mmio.is_empty()); assert!(result.chipset_high_mmio.is_empty()); let mut config = input(2 * GB, None, None); - config.chipset_low_mmio_size = 0; + config.layout.chipset_low_mmio_size = 0; let result = resolve_memory_layout(config).unwrap(); // Low is always at least the arch reserved zone. assert!(!result.chipset_low_mmio.is_empty()); diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index 2203b88180..9305c8dd0f 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -45,8 +45,8 @@ use zerocopy::IntoBytes; #[derive(Debug, Error)] pub enum Error { - #[error("command line contains an embedded NUL byte")] - CommandLineContainsNul, + #[error("command line contains an embedded NUL byte at offset {0}")] + CommandLineContainsNul(usize), #[error("failed to read igvm file")] Igvm(#[source] std::io::Error), #[error("invalid igvm file")] @@ -594,8 +594,8 @@ fn load_igvm_x86( // The command line is exposed to the guest as a NUL-terminated byte // sequence (via the IGVM CommandLine parameter), so reject any embedded NUL // bytes up front. 
- if cmdline.as_bytes().contains(&0) { - return Err(Error::CommandLineContainsNul); + if let Some(pos) = cmdline.as_bytes().iter().position(|&b| b == 0) { + return Err(Error::CommandLineContainsNul(pos)); } let (mask, max_vtl) = match vbs_platform_header(igvm_file)? { diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs index 87608da103..c5593ee428 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs +++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs @@ -415,9 +415,8 @@ async fn openhcl_linux_vtl2_mmio_self_allocate( config: PetriVmBuilder, ) -> Result<(), anyhow::Error> { // Default chipset MMIO sizes for `HclHost` from - // `vm_manifest_builder::VmChipsetBuilder::build`. Keep in sync with that - // file. - const DEFAULT_LOW_MMIO_SIZE: u64 = 96 * 1024 * 1024; + // `vm_manifest_builder::layout_config`. Keep in sync with that file. + const DEFAULT_LOW_MMIO_SIZE: u64 = 128 * 1024 * 1024; const DEFAULT_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; const DEFAULT_VTL2_MMIO_SIZE: u64 = 1024 * 1024 * 1024; // `mmio-size` is hardcoded in openvmm — see