diff --git a/Cargo.lock b/Cargo.lock index 163090f441..fd7a72c74b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5427,7 +5427,6 @@ dependencies = [ "input_core", "inspect", "inspect_proto", - "memory_range", "mesh", "mesh_process", "mesh_rpc", @@ -6019,7 +6018,6 @@ dependencies = [ "kmsg", "libtest-mimic", "linkme", - "memory_range", "mesh", "mesh_process", "mesh_worker", @@ -9515,6 +9513,7 @@ dependencies = [ "serial_pl011_resources", "thiserror 2.0.16", "vm_resource", + "vmm_core_defs", "vmotherboard", ] diff --git a/Guide/src/SUMMARY.md b/Guide/src/SUMMARY.md index b9533b7948..c0036e01b8 100644 --- a/Guide/src/SUMMARY.md +++ b/Guide/src/SUMMARY.md @@ -127,6 +127,7 @@ - [Consomme](./reference/backends/consomme.md) - [Architecture](./reference/architecture.md) - [OpenVMM Architecture](./reference/architecture/openvmm.md) + - [Memory Layout](./reference/architecture/openvmm/memory-layout.md) - [mesh](./reference/architecture/openvmm/mesh.md) - [Using mesh](./reference/architecture/openvmm/mesh/usage.md) - [How mesh works](./reference/architecture/openvmm/mesh/internals.md) diff --git a/Guide/src/reference/architecture/openvmm/memory-layout.md b/Guide/src/reference/architecture/openvmm/memory-layout.md new file mode 100644 index 0000000000..947bfb66af --- /dev/null +++ b/Guide/src/reference/architecture/openvmm/memory-layout.md @@ -0,0 +1,295 @@ +# Memory Layout + +OpenVMM has to decide where every byte of guest physical address space goes: +RAM, MMIO windows for emulated and PCIe devices, paravisor private memory, and +architectural ranges like the LAPIC or GIC. This page describes how those +decisions are made. + +```admonish warning title="Compatibility surface" +Guest physical addresses are part of the VM's compatibility contract. Guests +remember device and RAM locations across hibernation, and saved VM state +references them. Changing request order, placement class, or alignment can +move guest addresses and break resume on existing VMs. 
+ +Treat layout policy changes like VM ABI changes: a new default may be fine +for new VMs, but existing persisted configuration must continue to resolve +to the same guest physical addresses. +``` + +## Two pieces + +Layout resolution is split into two pieces that you should think about +separately: + +1. A **pure address-space allocator** in `vm_topology::layout`. It knows + nothing about chipsets, firmware, VTLs, PCI, or the host. Callers describe + what they need in terms of ranges, sizes, alignments, and a placement + class, and the allocator returns deterministic guest physical addresses. +2. A **worker resolver** in `openvmm_core::worker::memory_layout`. This is + where OpenVMM's policy lives: which platform ranges are pinned, what + alignments NUMA nodes get, how PCIe ECAM is sized, and so on. The resolver + describes the VM to the allocator, runs it, and builds the resulting + [`MemoryLayout`](https://openvmm.dev/rustdoc/linux/vm_topology/memory/struct.MemoryLayout.html) + that the rest of the VM worker uses to look up RAM, MMIO, PCI ECAM, and + PCI MMIO ranges. + +Keeping the allocator policy-free means its behavior can be exhaustively +tested in isolation, and the worker can be reasoned about as a list of +requests that fully describes the VM. + +## The allocator + +[`LayoutBuilder`](https://openvmm.dev/rustdoc/linux/vm_topology/layout/struct.LayoutBuilder.html) +accepts four kinds of input: + +| Input | Purpose | +|---|---| +| `reserve(range)` | Block allocation at this address but do not include it in the layout top. | +| `fixed(range)` | A range whose address is already decided. Blocks allocation and counts as part of the layout. | +| `ram(size, alignment)` | Ordinary guest RAM. The only request type that may be split across multiple extents. | +| `request(size, alignment, placement)` | A single contiguous range, placed dynamically. The `placement` chooses one of three phases below. 
| + +`reserve` and `fixed` differ only in how they affect the **layout top** — +the address one past the highest guest-visible byte. `fixed` ranges raise +it; `reserve` ranges do not. This matters because the layout top determines +where post-MMIO requests (such as paravisor private memory) start: a +reserved hole high up in the address space should not push them even +higher. + +When `allocate()` runs, it processes requests in a fixed phase order. Each +phase pulls from whatever address space the earlier phases left free: + +1. **Reserved ranges** are removed from the free space. +2. **Fixed ranges** are removed from the free space. +3. **`Placement::Mmio32`** requests are packed *top down* below 4 GiB, so + RAM can start at GPA 0 and grow upward through the lowest free space. +4. **RAM** requests are placed *bottom up*, in caller order, splitting + around any holes left by the earlier phases. The first request starts at + GPA 0; each subsequent request starts at or above the highest address + used by previous RAM requests, so later requests never backfill + fragments earlier ones skipped. RAM is the only splittable kind. +5. **`Placement::Mmio64`** requests are packed *bottom up* starting at the + end of RAM. This makes the layout top a function of requested topology + rather than a precomputed high MMIO bucket size. +6. **`Placement::PostMmio`** requests are placed *after* everything else + (excluding reserved ranges from the "everything else"). They are for + ranges that should not affect the guest-visible top of memory. + +Within `Mmio32` and `Mmio64`, requests are sorted by alignment (largest +first), then size (largest first), then caller order. This keeps large, +strictly-aligned device windows from being fragmented by small devices. +RAM and `PostMmio` use caller order verbatim: RAM order is the NUMA vnode +assignment, and `PostMmio` carries policy that should not be reordered by +alignment. 
+ +```admonish note +The allocator does not take host physical-address width as an input. The +layout is computed as a pure function of VM configuration; the worker +checks the highest placed address against host capabilities after the fact. +This keeps guest physical addresses from shifting when the same VM moves +to a host with a different physical-address width. +``` + +## Worker policy + +The worker resolver in +[`openvmm_core::worker::memory_layout`](https://github.com/microsoft/openvmm/blob/main/openvmm/openvmm_core/src/worker/memory_layout.rs) +issues requests in this order: + +1. **Chipset low MMIO** (`fixed`) — a window pinned to end at 4 GiB, + advertised to firmware as `\_SB.VMOD._CRS`. The window always covers + at least the per-architecture reserved zone (LAPIC, IOAPIC, GIC, + PL011, battery, TPM, etc.) so guests can arbitrate fixed-address + children against this window. The caller-requested size may extend it + lower. + + | Architecture | Minimum range (architectural reserved zone) | + |---|---| + | x86_64 | `0xFE00_0000..0x1_0000_0000` | + | aarch64 | `0xEF00_0000..0x1_0000_0000` | + +2. **Chipset high MMIO** (`Mmio64`) — the corresponding high range. 2 MB + alignment. +3. **PCIe root complex ranges**, one per root complex: + - **ECAM** (`Mmio32`), one block per PCI segment. The size is derived from the segment's combined bus window as + `(max_bus - min_bus + 1) * 1 MB` (32 devices × 8 functions × + 4 KiB per config space). + - **Low MMIO** (`Mmio32`), 2 MB aligned. A caller can pin this to a + fixed range instead of supplying a size, for assigned-device, IOMMU, + and physical-topology passthrough. + - **High MMIO** (`Mmio64`), 1 GB aligned. A caller can pin this to a + fixed range as well. Per-BAR alignment would guarantee the entire + window is usable for one large BAR, but burns address space on + hosts with tight physical-address widths. +4. **Virtio-mmio slots** (`Mmio32`) — one contiguous region sized + `slot_count * 4 KiB`, when any slots are configured. +5. **RAM**, in vnode order. 
The first request becomes vnode 0, the second + vnode 1, and so on. Each vnode starts at or above the highest address + used by prior vnodes; vnode N+1 never backfills a fragment that vnode + N skipped. This keeps vnode ordering equal to address ordering and + turns vnode layout into a clean compatibility surface — adding a new + fixed or reserved range below RAM end can only shift the first vnode + whose own span actually covers it and the vnodes after it; earlier vnodes keep their addresses. Alignment depends on request size: + + | RAM request size | Alignment | + |---|---| + | < 1 GB | 2 MB | + | ≥ 1 GB | 1 GB | + + Alignment matters because RAM extents that start on a huge-page + boundary can be mapped with 2 MB or 1 GB huge pages in host and + guest page tables, avoiding the memory overhead and construction + cost of thousands of smaller page table entries and reducing TLB + pressure at runtime. Sub-GB nodes use 2 MB so small NUMA nodes + do not waste a full GB of address space. +6. **VTL2 chipset MMIO** (`PostMmio`) — VTL2's own VMBus / chipset MMIO + region, when VTL2 is configured. Placed after VTL0 so enabling VTL2 + does not move any VTL0 address. +7. **VTL2 private memory** (`PostMmio`) — when the IGVM file requests + layout-mode VTL2 memory, the worker takes only its size and alignment + from the IGVM relocation header. The IGVM file's relocation min/max + bounds are not fed in as constraints here; they are validated later by + the IGVM loader against the selected base. Treating them as constraints + here would over-constrain layout and could put holes in VTL0 just to + accommodate an IGVM file we will reject anyway. + +After `allocate()` succeeds, the worker collects the resolved ranges into +the `MemoryLayout`'s MMIO, PCI ECAM, and PCI MMIO gap vectors, then checks +the highest placed-range address (which includes VTL2 private memory and +VTL2 chipset MMIO) against the host's physical-address width. + +## RAM splitting + +RAM is the only splittable request. 
When contiguous free space is +available, the full requested size is placed at an aligned start address +— alignment constrains where the extent starts, not how large it is. A +1.5 GB request with 1 GB alignment in open space produces a single +`[0, 1.5 GB)` extent with no wasted space. + +Splitting only happens when a fixed or reserved range interrupts the free +space. In that case the alignment also acts as the **split granularity**: +partial chunks are rounded down to the alignment before continuing. This +keeps every RAM extent on a huge-page boundary so the host and guest can +use large pages (reducing page table overhead and TLB pressure), and +avoids sub-alignment fragments that would complicate the NUMA and +compatibility surface. + +The practical effect is that 1 GB-aligned RAM stays in 1 GB-aligned +chunks. A small fixed hole just above the 1 GB boundary will not cause a +"nearly 1 GB" RAM extent to be placed in the interrupted range; instead, +RAM resumes at the next 1 GB boundary. + +## Examples + +These examples use compact synthetic configurations. Each one is covered +by tests in `vm_topology::layout` or `openvmm_core::worker::memory_layout`. + +### A fixed MMIO range splits RAM + +4 GB of RAM with a 1 GB fixed MMIO range from 1 GB to 2 GB: + +| Input | Range | +|---|---| +| RAM request | 4 GB | +| Fixed MMIO | `0x4000_0000..0x8000_0000` | + +| Output | Range | +|---|---| +| RAM | `0x0000_0000..0x4000_0000` | +| MMIO | `0x4000_0000..0x8000_0000` | +| RAM | `0x8000_0000..0x1_4000_0000` | + +Total RAM is still 4 GB — the fixed range is occupied address space, not +RAM. 
+ +### GB-aligned RAM stays GB-aligned + +2 GB of RAM with a tiny fixed hole just above the 1 GB boundary should +not produce a sub-GB RAM fragment: + +| Input | Range | +|---|---| +| RAM request | 2 GB, 1 GB alignment | +| Fixed MMIO | `0x4010_0000..0x4020_0000` | + +| Output | Range | +|---|---| +| RAM | `0x0000_0000..0x4000_0000` | +| Fixed MMIO | `0x4010_0000..0x4020_0000` | +| RAM | `0x8000_0000..0xC000_0000` | + +The splitter places one full 1 GB chunk, refuses to use the interrupted +sub-GB fragment, and resumes at the next 1 GB boundary. + +### Small NUMA nodes use 2 MB alignment + +Two 512 MB NUMA nodes: + +| Input | Size | +|---|---| +| vnode 0 RAM | 512 MB | +| vnode 1 RAM | 512 MB | + +| Output | Range | +|---|---| +| vnode 0 RAM | `0x0000_0000..0x2000_0000` | +| vnode 1 RAM | `0x2000_0000..0x4000_0000` | + +With 1 GB alignment each node would burn a full GB of address space. +Request order is the vnode assignment, so swapping the requests swaps the +NUMA layout. + +### VTL2 does not move VTL0 + +Starting from 2 GB of VTL0 RAM and a fixed 1 GB MMIO hole: + +| VTL0 output | Range | +|---|---| +| RAM | `0x0000_0000..0x4000_0000` | +| MMIO | `0x4000_0000..0x8000_0000` | +| RAM | `0x8000_0000..0xC000_0000` | + +Adding a 2 MB VTL2 private-memory request leaves the VTL0 layout +identical and places VTL2 after the VTL0-visible top: + +| Private output | Range | +|---|---| +| VTL2 | `0xC000_0000..0xC020_0000` | + +`MemoryLayout::end_of_layout()` reports the top of all stored ranges, +including VTL2 chipset MMIO when present. +`MemoryLayout::vtl2_range()` reports the VTL2 private memory range +separately. 
+ +### Reserved holes do not raise the layout top + +A reserved range blocks allocation but is not a guest-visible resource, +so it does not push later post-MMIO ranges higher: + +| Input | Range | +|---|---| +| RAM request | 2 GB | +| Reserved hole | `0xFD_0000_0000..0xFD_4000_0000` | +| Post-MMIO request | 1 MB | + +| Output | Range | +|---|---| +| RAM | `0x0000_0000..0x8000_0000` | +| Post-MMIO | `0x8000_0000..0x8010_0000` | + +Trailing reserved ranges are omitted from the returned allocation list, +but a reserved range that sits between real allocations is reported so +callers can see the full occupied map. + +## When to update this page + +Update this page when any of these change: + +- the allocator's phase order or any phase's placement direction +- the semantics of `reserve`, `fixed`, `ram`, or `request` +- the architectural reserved zones or their per-architecture addresses +- the worker's RAM alignment policy +- PCIe ECAM sizing or per-BAR alignment policy +- VTL2 chipset MMIO or VTL2 private-memory placement +- the host physical-address validation step +- `MemoryLayout::end_of_layout()` or `MemoryLayout::vtl2_range()` semantics diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 7ebf795bea..d2c1185c8b 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -5,6 +5,9 @@ use crate::emuplat; use crate::partition::BindHvliteVp; use crate::partition::HvlitePartition; use crate::vmgs_non_volatile_store::HvLiteVmgsNonVolatileStore; +use crate::worker::memory_layout::MemoryLayoutInput; +use crate::worker::memory_layout::ResolvedPcieRootComplexRanges; +use crate::worker::memory_layout::resolve_memory_layout; use crate::worker::rom::RomBuilder; use acpi::dsdt; use anyhow::Context; @@ -197,6 +200,7 @@ impl Manifest { chipset_devices: config.chipset_devices, pci_chipset_devices: config.pci_chipset_devices, chipset_capabilities: config.chipset_capabilities, 
+ layout: config.layout, generation_id_recv: config.generation_id_recv, rtc_delta_milliseconds: config.rtc_delta_milliseconds, automatic_guest_reset: config.automatic_guest_reset, @@ -246,6 +250,7 @@ pub struct Manifest { chipset_devices: Vec, pci_chipset_devices: Vec, chipset_capabilities: VmChipsetCapabilities, + layout: vmm_core_defs::LayoutConfig, generation_id_recv: Option>, rtc_delta_milliseconds: i64, automatic_guest_reset: bool, @@ -397,6 +402,11 @@ pub(crate) struct InitializedVm { gm: GuestMemory, cfg: Manifest, mem_layout: MemoryLayout, + resolved_pcie_root_complex_ranges: Vec, + virtio_mmio_region: MemoryRange, + chipset_low_mmio: MemoryRange, + chipset_high_mmio: MemoryRange, + vtl2_chipset_mmio: MemoryRange, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, @@ -678,9 +688,15 @@ struct LoadedVmInner { chipset_cfg: BaseChipsetManifest, chipset_capabilities: VmChipsetCapabilities, #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] - virtio_mmio_count: usize, + virtio_mmio_region: MemoryRange, #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] virtio_mmio_irq: u32, + /// Chipset low MMIO range for VMOD/PCI0 _CRS. + chipset_low_mmio: MemoryRange, + /// Chipset high MMIO range for VMOD/PCI0 _CRS. + chipset_high_mmio: MemoryRange, + /// VTL2-private chipset MMIO range for VTL2 VMBus. + vtl2_chipset_mmio: MemoryRange, /// ((device, function), interrupt) #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] pci_legacy_interrupts: Vec<((u8, Option), u32)>, @@ -746,9 +762,10 @@ fn convert_vtl2_config( // Use the size, but the base is the requested load // base. - LateMapVtl0AllowedRanges::Ranges(vec![MemoryRange::new( - *base..(*base + range.len()), - )]) + let allowed = + MemoryRange::try_new(*base..base.wrapping_add(range.len())) + .with_context(|| format!("invalid vtl2 absolute base {base:#x}"))?; + LateMapVtl0AllowedRanges::Ranges(vec![allowed]) } Vtl2BaseAddressType::MemoryLayout { .. 
} => { LateMapVtl0AllowedRanges::MemoryLayout @@ -864,7 +881,7 @@ impl InitializedVm { let physical_address_size = proto.max_physical_address_size(); // Determine if a special vtl2 memory allocation should be used. - let vtl2_range = if let LoadMode::Igvm { + let vtl2_layout = if let LoadMode::Igvm { vtl2_base_address, .. } = &cfg.load_mode { @@ -873,21 +890,16 @@ impl InitializedVm { | Vtl2BaseAddressType::Absolute(_) | Vtl2BaseAddressType::Vtl2Allocate { .. } => None, Vtl2BaseAddressType::MemoryLayout { size } => { - let vtl2_range = super::vm_loaders::igvm::vtl2_memory_range( - physical_address_size, - cfg.memory.mem_size, - &cfg.memory.mmio_gaps, - &cfg.memory.pci_ecam_gaps, - &cfg.memory.pci_mmio_gaps, + let vtl2_layout = super::vm_loaders::igvm::vtl2_memory_layout_request( igvm_file .as_ref() .expect("igvm file should be already parsed"), *size, ) - .context("unable to determine vtl2 memory range")?; - tracing::info!(?vtl2_range, "vtl2 memory range selected"); + .context("unable to determine vtl2 memory layout request")?; + tracing::info!(?vtl2_layout, "vtl2 memory layout request selected"); - Some(vtl2_range) + Some(vtl2_layout) } } } else { @@ -895,49 +907,35 @@ impl InitializedVm { }; // Choose the memory layout of the VM. - let mem_layout = if let Some(ref sizes) = cfg.memory.numa_mem_sizes { - // When numa_mem_sizes is set, distribute guest RAM across vNUMA nodes - // for ACPI SRAT / FDT reporting. - // - // TODO: The vNUMA nodes reported are meant for test usage only, as they - // are not aligned to any physical NUMA node. There is more work to do - // to support useful vNUMA reporting. 
- let total: u64 = sizes - .iter() - .copied() - .try_fold(0u64, |acc, s| acc.checked_add(s)) - .context("numa memory sizes overflow")?; - anyhow::ensure!( - total == cfg.memory.mem_size, - "numa_mem_sizes total ({total:#x}) does not match mem_size ({:#x})", - cfg.memory.mem_size - ); - - MemoryLayout::new_with_numa( - sizes, - &cfg.memory.mmio_gaps, - &cfg.memory.pci_ecam_gaps, - &cfg.memory.pci_mmio_gaps, - vtl2_range, - ) - } else { - MemoryLayout::new( - cfg.memory.mem_size, - &cfg.memory.mmio_gaps, - &cfg.memory.pci_ecam_gaps, - &cfg.memory.pci_mmio_gaps, - vtl2_range, - ) - } + // + // When numa_mem_sizes is set, distribute guest RAM across vNUMA nodes + // for ACPI SRAT / FDT reporting. + // + // TODO: The vNUMA nodes reported are meant for test usage only, as they + // are not aligned to any physical NUMA node. There is more work to do + // to support useful vNUMA reporting. + let virtio_mmio_count = cfg + .virtio_devices + .iter() + .filter(|(bus, _)| matches!(bus, VirtioBus::Mmio)) + .count(); + + let resolved_layout = resolve_memory_layout(MemoryLayoutInput { + mem_size: cfg.memory.mem_size, + numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), + layout: cfg.layout.clone(), + pcie_root_complexes: &cfg.pcie_root_complexes, + virtio_mmio_count, + vtl2_layout, + physical_address_size, + }) .context("invalid memory configuration")?; - - if mem_layout.end_of_layout() > 1 << physical_address_size { - anyhow::bail!( - "memory layout ends at {:#x}, which exceeds the address with of {} bits", - mem_layout.end_of_layout(), - physical_address_size - ); - } + let mem_layout = resolved_layout.memory_layout; + let resolved_pcie_root_complex_ranges = resolved_layout.pcie_root_complex_ranges; + let virtio_mmio_region = resolved_layout.virtio_mmio_region; + let chipset_low_mmio = resolved_layout.chipset_low_mmio; + let chipset_high_mmio = resolved_layout.chipset_high_mmio; + let vtl2_chipset_mmio = resolved_layout.vtl2_chipset_mmio; // Place the alias map at the end of 
the address space. Newer versions // of OpenHCL support receiving this offset via devicetree (especially @@ -1049,6 +1047,11 @@ impl InitializedVm { gm, cfg, mem_layout, + resolved_pcie_root_complex_ranges, + virtio_mmio_region, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, processor_topology, igvm_file, driver_source, @@ -1075,6 +1078,11 @@ impl InitializedVm { gm, cfg, mem_layout, + resolved_pcie_root_complex_ranges, + virtio_mmio_region, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, processor_topology, igvm_file, driver_source, @@ -1782,7 +1790,11 @@ impl InitializedVm { let mut pcie_host_bridges = Vec::new(); let mut pcie_root_complexes = Vec::new(); - for rc in cfg.pcie_root_complexes { + for (rc, ranges) in cfg + .pcie_root_complexes + .into_iter() + .zip(resolved_pcie_root_complex_ranges) + { let device_name = format!("pcie-root:{}", rc.name); // Create a static bus range for the root complex so that @@ -1814,7 +1826,7 @@ impl InitializedVm { &mut services.register_mmio(), rc.start_bus, rc.end_bus, - rc.ecam_range, + ranges.ecam_range, root_port_definitions, msi_conn.target(), ) @@ -1839,9 +1851,9 @@ impl InitializedVm { segment: rc.segment, start_bus: rc.start_bus, end_bus: rc.end_bus, - ecam_range: rc.ecam_range, - low_mmio: rc.low_mmio, - high_mmio: rc.high_mmio, + ecam_range: ranges.ecam_range, + low_mmio: ranges.low_mmio, + high_mmio: ranges.high_mmio, }); pcie_root_complexes.push(root_complex.clone()); @@ -2355,15 +2367,11 @@ impl InitializedVm { // add virtio devices - // Construct virtio devices. - // - // TODO: allocate PCI and MMIO space better. + // Construct virtio devices. Virtio-mmio device addresses are resolved + // by the memory layout allocator; each slot is a 4 KiB Mmio32 + // allocation indexed by the order of VirtioBus::Mmio devices. 
let mut pci_device_number = 10; - if mem_layout.mmio().len() < 2 { - anyhow::bail!("at least two mmio regions are required"); - } - let mut virtio_mmio_start = mem_layout.mmio()[1].end(); - let mut virtio_mmio_count = 0; + let mut virtio_mmio_index = 0; // Avoid an ISA interrupt to avoid conflicts and to avoid needing to // configure the line as level-triggered in the MADT (necessary for @@ -2389,8 +2397,8 @@ impl InitializedVm { .await?; match bus { VirtioBus::Mmio => { - let mmio_start = virtio_mmio_start - 0x1000; - virtio_mmio_start -= 0x1000; + let mmio_start = virtio_mmio_region.start() + virtio_mmio_index as u64 * 0x1000; + virtio_mmio_index += 1; let id = format!("{id}-{mmio_start}"); let gm = gm.clone(); chipset_builder.arc_mutex_device(id).try_add(|services| { @@ -2404,7 +2412,6 @@ impl InitializedVm { 0x1000, ) })?; - virtio_mmio_count += 1; } VirtioBus::Pci => { let pci_inta_line = pci_inta_line.context("missing PCI INT#A line")?; @@ -2441,8 +2448,6 @@ impl InitializedVm { } } - assert!(virtio_mmio_start >= mem_layout.mmio()[1].start()); - let (chipset, devices) = chipset_builder.build()?; let (fatal_error_send, _fatal_error_recv) = mesh::channel(); let chipset = vmm_core::vmotherboard_adapter::AdaptedChipset::new( @@ -2541,8 +2546,11 @@ impl InitializedVm { chipset_capabilities: cfg.chipset_capabilities, firmware_event_send: cfg.firmware_event_send, load_mode: cfg.load_mode, - virtio_mmio_count, + virtio_mmio_region, virtio_mmio_irq, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, pci_legacy_interrupts, igvm_file, next_igvm_file: None, @@ -2648,7 +2656,7 @@ impl LoadedVmInner { dsdt, &self.chipset_cfg, enable_serial, - self.virtio_mmio_count, + self.virtio_mmio_region, self.virtio_mmio_irq, &self.pci_legacy_interrupts, ) @@ -2784,6 +2792,9 @@ impl LoadedVmInner { with_vmbus_redirect: self.vmbus_redirect, com_serial, entropy: Some(&entropy), + chipset_low_mmio: self.chipset_low_mmio, + chipset_high_mmio: self.chipset_high_mmio, + 
vtl2_chipset_mmio: self.vtl2_chipset_mmio, }; super::vm_loaders::igvm::load_igvm(params)? } @@ -3358,6 +3369,11 @@ impl LoadedVm { chipset_devices: vec![], // TODO pci_chipset_devices: vec![], // TODO chipset_capabilities: self.inner.chipset_capabilities, + layout: vmm_core_defs::LayoutConfig { + chipset_low_mmio_size: 0, + chipset_high_mmio_size: 0, + vtl2_chipset_mmio_size: 0, + }, // TODO generation_id_recv: None, // TODO rtc_delta_milliseconds: 0, // TODO automatic_guest_reset: self.inner.automatic_guest_reset, @@ -3400,7 +3416,7 @@ fn add_devices_to_dsdt_x64( dsdt: &mut dsdt::Dsdt, cfg: &BaseChipsetManifest, serial_uarts: bool, - virtio_mmio_count: usize, + virtio_mmio_region: MemoryRange, virtio_mmio_irq: u32, pci_legacy_interrupts: &[((u8, Option), u32)], // ((device, function), interrupt) ) { @@ -3426,34 +3442,25 @@ fn add_devices_to_dsdt_x64( "the DSDT describes two MMIO regions" ); let low_mmio_gap = mem_layout.mmio()[0]; - let mut high_mmio_space: std::ops::Range = mem_layout.mmio()[1].into(); - // Device(\_SB.VI00) - // { - // Name(_HID, "LNRO0005") - // Name(_UID, 0) - // Name(_CRS, ResourceTemplate() - // { - // QWORDMemory(,,,,,ReadWrite,0,0x1fffff000,0x1ffffffff,0,0x1000) - // Interrupt(ResourceConsumer, Level, ActiveHigh, Exclusive) - // {5} - // }) - // } - // TODO: manage MMIO space better than this - for i in 0..virtio_mmio_count { - high_mmio_space.end -= HV_PAGE_SIZE; - let mut device = dsdt::Device::new(format!("\\_SB.VI{i:02}").as_bytes()); - device.add_object(&dsdt::NamedString::new(b"_HID", b"LNRO0005")); - device.add_object(&dsdt::NamedInteger::new(b"_UID", i as u64)); - let mut crs = dsdt::CurrentResourceSettings::new(); - crs.add_resource(&dsdt::QwordMemory::new(high_mmio_space.end, HV_PAGE_SIZE)); - let mut intr = dsdt::Interrupt::new(virtio_mmio_irq); - intr.is_edge_triggered = false; - crs.add_resource(&intr); - device.add_object(&crs); - dsdt.add_object(&device); - } + let high_mmio_gap = mem_layout.mmio()[1]; - let high_mmio_gap = 
MemoryRange::new(high_mmio_space); + // Virtio-mmio devices are allocated as a contiguous region by the memory + // layout resolver. Each 4 KiB slot is a separate device. + { + for i in 0..virtio_mmio_region.page_count_4k() { + let slot_base = virtio_mmio_region.start() + i * HV_PAGE_SIZE; + let mut device = dsdt::Device::new(format!("\\_SB.VI{i:02}").as_bytes()); + device.add_object(&dsdt::NamedString::new(b"_HID", b"LNRO0005")); + device.add_object(&dsdt::NamedInteger::new(b"_UID", i)); + let mut crs = dsdt::CurrentResourceSettings::new(); + crs.add_resource(&dsdt::QwordMemory::new(slot_base, HV_PAGE_SIZE)); + let mut intr = dsdt::Interrupt::new(virtio_mmio_irq); + intr.is_edge_triggered = false; + crs.add_resource(&intr); + device.add_object(&crs); + dsdt.add_object(&device); + } + } if cfg.with_generic_pci_bus || cfg.with_i440bx_host_pci_bridge { // TODO: actually plumb through legacy PCI interrupts diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs new file mode 100644 index 0000000000..3d42ca88d2 --- /dev/null +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -0,0 +1,888 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Guest physical memory layout resolution for the VM worker. +//! +//! This module is the point where OpenVMM turns stable VM configuration and +//! already-known platform ranges into the production [`MemoryLayout`]. The +//! resulting guest physical addresses are part of the VM's compatibility surface: +//! hibernated guests and saved VMs remember device and RAM locations, so changes +//! to the request order, placement class, or alignment policy can break resume or +//! restore. Keep layout policy changes deliberate and covered by tests. +//! +//! The resolver owns all layout consumers: architectural reserved zones (LAPIC, +//! IOAPIC, GIC, etc.), chipset MMIO (VMBus, PIIX4 PCI BARs), PCIe +//! 
ECAM/BAR pools, virtio-mmio slots, ordinary RAM, VTL2 private memory, and +//! VTL2 chipset MMIO. Callers express sizing intent; the resolver places +//! everything and derives the effective MMIO gaps for [`MemoryLayout`]. + +use super::vm_loaders::igvm::Vtl2MemoryLayoutRequest; +use anyhow::Context; +use anyhow::bail; +use memory_range::MemoryRange; +use openvmm_defs::config::PcieMmioRangeConfig; +use openvmm_defs::config::PcieRootComplexConfig; +use std::sync::Arc; +use vm_topology::layout::LayoutBuilder; +use vm_topology::layout::Placement; +use vm_topology::memory::MemoryLayout; +use vm_topology::memory::MemoryRangeWithNode; + +const PAGE_SIZE: u64 = 4096; +const TWO_MB: u64 = 2 * 1024 * 1024; +const GB: u64 = 1024 * 1024 * 1024; + +/// PCIe ECAM: 32 devices * 8 functions * 4 KiB config space = 1 MB per bus. +const PCIE_ECAM_BYTES_PER_BUS: u64 = 32 * 8 * 4096; + +/// Minimum guest physical address at which an ECAM range may be placed. +/// +/// The ACPI MCFG table reports the bus-0 base as +/// `ecam_range.start() - start_bus * 1 MiB`. `start_bus` is a `u8`, so up to +/// 255 MiB of headroom may be required. Rounding up to a flat 256 MiB gives a +/// single easy-to-remember invariant that works for every legal `start_bus` +/// value, independent of any individual root complex's configuration. +const PCIE_ECAM_MIN_ADDRESS: u64 = 256 * 1024 * 1024; + +#[derive(Debug)] +pub(super) struct ResolvedMemoryLayout { + pub memory_layout: MemoryLayout, + pub pcie_root_complex_ranges: Vec, + /// Contiguous MMIO region for all virtio-mmio device slots. Each slot is + /// 4 KiB, indexed from the start of the region. `EMPTY` when no + /// virtio-mmio devices are configured. + pub virtio_mmio_region: MemoryRange, + /// Chipset low MMIO range (below 4 GB) for VMOD/PCI0 _CRS. Always at + /// least the architectural reserved zone (LAPIC, IOAPIC, TPM, ...). + pub chipset_low_mmio: MemoryRange, + /// Chipset high MMIO range (above RAM) for VMOD/PCI0 _CRS. 
`EMPTY` when + /// no chipset high MMIO is configured. + pub chipset_high_mmio: MemoryRange, + /// VTL2-private chipset MMIO range, reported to VTL2 VMBus via the device + /// tree. `EMPTY` when VTL2 is not configured or has no chipset MMIO. + pub vtl2_chipset_mmio: MemoryRange, +} + +#[derive(Debug)] +pub(super) struct ResolvedPcieRootComplexRanges { + pub ecam_range: MemoryRange, + pub low_mmio: MemoryRange, + pub high_mmio: MemoryRange, +} + +pub(super) struct MemoryLayoutInput<'a> { + /// Total VTL0 RAM size requested by the VM configuration. + pub mem_size: u64, + /// Optional per-vNUMA RAM budgets. When present, these must sum to + /// `mem_size`, and request order is the vnode assignment order. + pub numa_mem_sizes: Option<&'a [u64]>, + /// Chipset MMIO sizing from the manifest builder. + pub layout: vmm_core_defs::LayoutConfig, + /// PCIe root complex address-space intents. These are resolved by this + /// worker step so front ends do not need to carve guest physical addresses. + pub pcie_root_complexes: &'a [PcieRootComplexConfig], + /// Number of virtio-mmio device slots to allocate in 32-bit MMIO space. + /// A single contiguous region of `count * 4 KiB` is allocated. + pub virtio_mmio_count: usize, + /// Optional IGVM VTL2 private-memory request. This is allocated after all + /// VTL0-visible RAM and MMIO and is carried separately from ordinary RAM. + pub vtl2_layout: Option, + /// Host-supported physical address width used only after allocation. The + /// allocator computes the smallest layout it can; host fit is validation. + pub physical_address_size: u8, +} + +/// Architectural reserved zone for x86_64: LAPIC, IOAPIC, battery, TPM. +const ARCH_RESERVED_X86_64: MemoryRange = MemoryRange::new(0xFE00_0000..0x1_0000_0000); + +/// Architectural reserved zone for aarch64: GIC, PL011, battery. 
+const ARCH_RESERVED_AARCH64: MemoryRange = MemoryRange::new(0xEF00_0000..0x1_0000_0000); + +pub(super) fn resolve_memory_layout( + input: MemoryLayoutInput<'_>, +) -> anyhow::Result { + let ram_sizes = validate_ram_sizes(input.mem_size, input.numa_mem_sizes)?; + + let mut ram_ranges_by_node = vec![Vec::new(); ram_sizes.len()]; + let mut pcie_root_complex_ranges = input + .pcie_root_complexes + .iter() + .map(|_| ResolvedPcieRootComplexRanges { + ecam_range: MemoryRange::EMPTY, + low_mmio: MemoryRange::EMPTY, + high_mmio: MemoryRange::EMPTY, + }) + .collect::>(); + + let mut builder = LayoutBuilder::new(); + + // Chipset low MMIO (Mmio32): a fixed window pinned to the top of 32-bit + // address space, advertised to firmware as `\_SB.VMOD._CRS`. Always at + // least the architectural reserved zone (LAPIC, IOAPIC, TPM, ...) so + // guests can arbitrate fixed-address children like TPM2 against this + // window; the caller-requested size may extend it lower. + let arch_reserved = if cfg!(guest_arch = "x86_64") { + ARCH_RESERVED_X86_64 + } else { + ARCH_RESERVED_AARCH64 + }; + let four_gb = 4 * GB; + let low_mmio_size = u64::from(input.layout.chipset_low_mmio_size) + .next_multiple_of(0x1000) + .max(arch_reserved.len()); + let chipset_low_mmio = MemoryRange::new(four_gb - low_mmio_size..four_gb); + builder.fixed("chipset-low-mmio", chipset_low_mmio); + + // Chipset high MMIO (Mmio64): VMOD/PCI0 _CRS high range. + let mut chipset_high_mmio = MemoryRange::EMPTY; + if input.layout.chipset_high_mmio_size != 0 { + builder.request( + "chipset-high-mmio", + &mut chipset_high_mmio, + input.layout.chipset_high_mmio_size, + TWO_MB, + Placement::Mmio64, + ); + } + + // Group root complexes by PCI segment so that RCs sharing a segment get a + // single contiguous ECAM block. This ensures the MCFG bus-0 base address + // is consistent for all RCs in the same segment. 
+ struct SegmentEcam { + segment: u16, + min_bus: u8, + max_bus: u8, + range: MemoryRange, + } + let mut segment_ecams: Vec = Vec::new(); + for rc in input.pcie_root_complexes { + if let Some(entry) = segment_ecams.iter_mut().find(|e| e.segment == rc.segment) { + entry.min_bus = entry.min_bus.min(rc.start_bus); + entry.max_bus = entry.max_bus.max(rc.end_bus); + } else { + segment_ecams.push(SegmentEcam { + segment: rc.segment, + min_bus: rc.start_bus, + max_bus: rc.end_bus, + range: MemoryRange::EMPTY, + }); + } + } + + // ECAM: always dynamically allocated below 4GB (since Linux on x86_64 + // refuses to use ECAM above 4GB unless the BIOS is of a special shape). + // + // TODO: fix the Linux loader and move this above 4GB before the layout + // is stabilized. + for se in &mut segment_ecams { + let bus_count = u64::from(se.max_bus - se.min_bus) + 1; + builder.request( + format!("pcie-seg{}-ecam", se.segment), + &mut se.range, + bus_count * PCIE_ECAM_BYTES_PER_BUS, + PCIE_ECAM_BYTES_PER_BUS, + Placement::Mmio32, + ); + } + + for (root_complex, ranges) in input + .pcie_root_complexes + .iter() + .zip(&mut pcie_root_complex_ranges) + { + // Low MMIO: 2 MB aligned. + add_mmio_range( + &mut builder, + format!("pcie-{}-low-mmio", root_complex.name), + &mut ranges.low_mmio, + &root_complex.low_mmio, + TWO_MB, + Placement::Mmio32, + )?; + // High MMIO: 1 GB aligned. Ideally we'd align it to its actual size so + // that the full amount is always usable for a single large BAR. But + // that burns physical address space, which is especially limited on + // some x86 machines. + // + // The downside of this approach is that the maximum mappable BAR size + // is a function of the rest of the topology, which can create + // reliability issues for users. 
+ add_mmio_range( + &mut builder, + format!("pcie-{}-high-mmio", root_complex.name), + &mut ranges.high_mmio, + &root_complex.high_mmio, + GB, + Placement::Mmio64, + )?; + } + + // Virtio-mmio: allocate one contiguous region for all slots. Each slot is + // 4 KiB, so the region is `count * 4 KiB` placed as a single Mmio32 + // request. + let mut virtio_mmio_region = MemoryRange::EMPTY; + if input.virtio_mmio_count > 0 { + builder.request( + "virtio-mmio", + &mut virtio_mmio_region, + input.virtio_mmio_count as u64 * PAGE_SIZE, + PAGE_SIZE, + Placement::Mmio32, + ); + } + + // RAM request order is part of the NUMA compatibility contract: the first + // request maps to vnode 0, the second to vnode 1, and so on. For GB-sized + // nodes, use GB alignment so holes do not create sub-GB RAM chunks. For + // sub-GB nodes, use 2 MB alignment to avoid wasting a full GB of address + // space per small node. + for (vnode, (ram_size, ram_ranges)) in ram_sizes + .iter() + .copied() + .zip(&mut ram_ranges_by_node) + .enumerate() + { + let ram_alignment = if ram_size < GB { TWO_MB } else { GB }; + builder.ram(format!("ram{vnode}"), ram_ranges, ram_size, ram_alignment); + } + + // VTL2 chipset MMIO is implementation-private — placed after all + // VTL0-visible RAM/MMIO so enabling VTL2 does not move VTL0 addresses. + let mut vtl2_chipset_mmio = MemoryRange::EMPTY; + if input.layout.vtl2_chipset_mmio_size != 0 { + builder.request( + "vtl2-chipset-mmio", + &mut vtl2_chipset_mmio, + input.layout.vtl2_chipset_mmio_size, + TWO_MB, + Placement::PostMmio, + ); + } + + // VTL2 MemoryLayout mode is implementation-private memory, not a VTL0 RAM + // hole. Allocate it only after all VTL0-visible RAM/MMIO so enabling VTL2 + // does not move the VTL0 layout. 
+ // + // IGVM relocation min/max constraints are checked later by the IGVM loader + // against the selected base; using them as a constraint here would be + // overconstraining and would lead to holes in the VTL0 layout--we just + // don't support IGVM files with relocation sections that cannot be + // satisfied by the post-MMIO space. + let mut vtl2_range = MemoryRange::EMPTY; + if let Some(vtl2_layout) = input.vtl2_layout { + builder.request( + "vtl2", + &mut vtl2_range, + vtl2_layout.size, + vtl2_layout.alignment, + Placement::PostMmio, + ); + } + + let placed_ranges = builder + .allocate() + .context("allocating memory layout ranges")?; + + // Subdivide per-segment ECAM blocks into per-RC sub-ranges. + for (root_complex, ranges) in input + .pcie_root_complexes + .iter() + .zip(&mut pcie_root_complex_ranges) + { + let se = segment_ecams + .iter() + .find(|e| e.segment == root_complex.segment) + .expect("segment must exist"); + let offset = u64::from(root_complex.start_bus - se.min_bus) * PCIE_ECAM_BYTES_PER_BUS; + let size = (u64::from(root_complex.end_bus - root_complex.start_bus) + 1) + * PCIE_ECAM_BYTES_PER_BUS; + ranges.ecam_range = + MemoryRange::new(se.range.start() + offset..se.range.start() + offset + size); + } + + // Enforce the MCFG bus-0 base invariant: every ECAM range must sit at + // `PCIE_ECAM_MIN_ADDRESS` or above. Fail fast at VM construction with a + // clear error rather than letting an unrepresentable MCFG entry surface + // later as a panic (debug) or silent wraparound (release). 
+ for (root_complex, ranges) in input + .pcie_root_complexes + .iter() + .zip(&pcie_root_complex_ranges) + { + if ranges.ecam_range.start() < PCIE_ECAM_MIN_ADDRESS { + bail!( + "PCIe root complex {:?}: ECAM at {:#x} is below the {:#x} minimum", + root_complex.name, + ranges.ecam_range.start(), + PCIE_ECAM_MIN_ADDRESS, + ); + } + } + + let ram = ram_ranges_by_node + .into_iter() + .enumerate() + .flat_map(|(vnode, ranges)| { + ranges.into_iter().map(move |range| MemoryRangeWithNode { + range, + vnode: vnode as u32, + }) + }) + .collect::>(); + + let vtl2_range = input.vtl2_layout.map(|_| vtl2_range); + + // `MemoryLayout::mmio()` is a positional contract: `[0]` = chipset low + // MMIO, `[1]` = chipset high MMIO, and (when VTL2 is enabled) `[2]` = + // the VTL2-private chipset MMIO range. Consumers (DSDT, Linux DT, UEFI, + // PCAT) rely on this ordering. Entries may be `MemoryRange::EMPTY` when + // the corresponding range is not configured; the positional index is + // what matters, not the presence of a non-empty range. + let mut mmio_gaps: Vec = vec![chipset_low_mmio, chipset_high_mmio]; + if !vtl2_chipset_mmio.is_empty() { + mmio_gaps.push(vtl2_chipset_mmio); + } + + let mut pci_ecam_gaps: Vec = Vec::new(); + pci_ecam_gaps.extend(segment_ecams.iter().map(|se| se.range)); + pci_ecam_gaps.sort(); + + let mut pci_mmio_gaps: Vec = Vec::new(); + pci_mmio_gaps.extend( + pcie_root_complex_ranges + .iter() + .flat_map(|ranges| [ranges.low_mmio, ranges.high_mmio]), + ); + pci_mmio_gaps.sort(); + + let memory_layout = MemoryLayout::new_from_resolved_ranges( + ram, + mmio_gaps, + pci_ecam_gaps, + pci_mmio_gaps, + vtl2_range, + ) + .context("validating resolved memory layout")?; + + // Host address-width validation is intentionally after allocation. The + // layout engine is host-width independent, which keeps the layout a pure + // function of VM configuration and avoids host differences changing guest + // physical addresses. 
+ let address_space_limit = 1u64 << input.physical_address_size; + let layout_top = placed_ranges.last().map(|r| r.range.end()).unwrap_or(0); + if layout_top > address_space_limit { + bail!( + "memory layout ends at {:#x}, which exceeds the address width of {} bits", + layout_top, + input.physical_address_size + ); + } + + Ok(ResolvedMemoryLayout { + memory_layout, + pcie_root_complex_ranges, + virtio_mmio_region, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, + }) +} + +fn add_mmio_range<'a>( + builder: &mut LayoutBuilder<'a>, + tag: impl Into>, + target: &'a mut MemoryRange, + config: &PcieMmioRangeConfig, + alignment: u64, + placement: Placement, +) -> anyhow::Result<()> { + let tag = tag.into(); + match config { + PcieMmioRangeConfig::Dynamic { size } => { + builder.request(tag, target, *size, alignment, placement); + } + PcieMmioRangeConfig::Fixed(range) => { + // A fixed low-MMIO range must satisfy the Mmio32 placement contract. + // Without this check, an above-4 GiB range would be accepted and + // then silently truncated to 32 bits in the ARM64 PCI device tree + // (`ranges` property uses `low_start as u32`). + if placement == Placement::Mmio32 && range.end() > 4 * GB { + bail!("{tag}: fixed low MMIO range {range} must end at or below 4 GiB",); + } + *target = *range; + builder.fixed(tag, *range); + } + } + Ok(()) +} + +fn validate_ram_sizes(mem_size: u64, numa_mem_sizes: Option<&[u64]>) -> anyhow::Result> { + // Keep validation compatible with `MemoryLayout::new()` / `new_with_numa()`: + // RAM sizes are page-granular, nonzero, and NUMA budgets must exactly cover + // the configured total. 
+ if mem_size == 0 || !mem_size.is_multiple_of(PAGE_SIZE) { + bail!("invalid memory size {mem_size:#x}"); + } + + let Some(numa_mem_sizes) = numa_mem_sizes else { + return Ok(vec![mem_size]); + }; + + if numa_mem_sizes.is_empty() { + bail!("empty NUMA memory sizes"); + } + + for &size in numa_mem_sizes { + if size == 0 || !size.is_multiple_of(PAGE_SIZE) { + bail!("invalid NUMA node memory size {size:#x}"); + } + } + + let total = numa_mem_sizes + .iter() + .copied() + .try_fold(0u64, |acc, size| acc.checked_add(size)) + .context("numa memory sizes overflow")?; + if total != mem_size { + bail!("numa_mem_sizes total ({total:#x}) does not match mem_size ({mem_size:#x})"); + } + + Ok(numa_mem_sizes.to_vec()) +} + +#[cfg(test)] +mod tests { + use super::*; + use vm_topology::memory::AddressType; + + const MB: u64 = 1024 * 1024; + // Match the production defaults from `vm_manifest_builder`. + #[cfg(guest_arch = "x86_64")] + const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u32 = 128 * 1024 * 1024; + #[cfg(guest_arch = "aarch64")] + const DEFAULT_CHIPSET_LOW_MMIO_SIZE: u32 = 512 * 1024 * 1024; + #[cfg(guest_arch = "x86_64")] + const ARCH_RESERVED: MemoryRange = ARCH_RESERVED_X86_64; + #[cfg(guest_arch = "aarch64")] + const ARCH_RESERVED: MemoryRange = ARCH_RESERVED_AARCH64; + const DEFAULT_CHIPSET_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; + const DEFAULT_VTL2_CHIPSET_MMIO_SIZE: u64 = GB; + + const DEFAULT_LAYOUT: vmm_core_defs::LayoutConfig = vmm_core_defs::LayoutConfig { + chipset_low_mmio_size: DEFAULT_CHIPSET_LOW_MMIO_SIZE, + chipset_high_mmio_size: DEFAULT_CHIPSET_HIGH_MMIO_SIZE, + vtl2_chipset_mmio_size: 0, + }; + + fn input( + mem_size: u64, + numa_mem_sizes: Option<&[u64]>, + vtl2_layout: Option, + ) -> MemoryLayoutInput<'_> { + MemoryLayoutInput { + mem_size, + numa_mem_sizes, + layout: DEFAULT_LAYOUT, + pcie_root_complexes: &[], + virtio_mmio_count: 0, + vtl2_layout, + physical_address_size: 46, + } + } + + fn resolve(input: MemoryLayoutInput<'_>) -> MemoryLayout { + 
resolve_memory_layout(input).unwrap().memory_layout + } + + fn vtl2_layout(size: u64) -> Vtl2MemoryLayoutRequest { + Vtl2MemoryLayoutRequest { + size, + alignment: PAGE_SIZE, + } + } + + fn pcie_root_complex( + low_mmio: PcieMmioRangeConfig, + high_mmio: PcieMmioRangeConfig, + ) -> PcieRootComplexConfig { + PcieRootComplexConfig { + index: 0, + name: "rc0".to_string(), + segment: 0, + start_bus: 0, + end_bus: 0, + low_mmio, + high_mmio, + ports: Vec::new(), + } + } + + #[test] + fn basic_ram_placement() { + let actual = resolve(input(2 * GB, None, None)); + + assert_eq!(actual.ram_size(), 2 * GB); + // RAM starts at GPA 0 and fills upward. + assert_eq!(actual.ram()[0].range.start(), 0); + } + + #[test] + fn ram_splits_around_arch_reserved_zone() { + // 4 GB of RAM must split around the architectural reserved zone + // and the chipset MMIO allocations below 4 GB. + let actual = resolve(input(4 * GB, None, None)); + + assert_eq!(actual.ram_size(), 4 * GB); + // RAM must not overlap the architectural reserved zone. + let reserved = ARCH_RESERVED; + for ram in actual.ram() { + assert!( + !ram.range.overlaps(&reserved), + "RAM {:?} overlaps reserved {:?}", + ram.range, + reserved + ); + } + } + + #[test] + fn numa_preserves_node_ordering() { + let sizes = [2 * GB, 2 * GB]; + + let actual = resolve(input(4 * GB, Some(&sizes), None)); + + // First vnode's RAM starts at 0. + assert_eq!(actual.ram()[0].vnode, 0); + assert_eq!(actual.ram()[0].range.start(), 0); + // All RAM accounts for 4 GB total. 
+ assert_eq!(actual.ram_size(), 4 * GB); + } + + #[test] + fn chipset_mmio_is_resolved() { + let result = resolve_memory_layout(input(2 * GB, None, None)).unwrap(); + + let low = result.chipset_low_mmio; + let high = result.chipset_high_mmio; + assert_eq!(low.len(), DEFAULT_CHIPSET_LOW_MMIO_SIZE as u64); + assert_eq!(high.len(), DEFAULT_CHIPSET_HIGH_MMIO_SIZE); + // Chipset low MMIO is pinned to end at 4 GiB and must fully contain + // the architectural reserved zone (LAPIC, IOAPIC, TPM, ...). + assert_eq!(low.end(), 4 * GB); + assert!(low.contains(&ARCH_RESERVED)); + assert!( + high.start() >= 2 * GB, + "high chipset MMIO should be above RAM" + ); + } + + #[test] + fn pcie_dynamic_intents_are_resolved() { + let root_complexes = [pcie_root_complex( + PcieMmioRangeConfig::Dynamic { size: 64 * MB }, + PcieMmioRangeConfig::Dynamic { size: GB }, + )]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let actual = resolve_memory_layout(config).unwrap(); + let ranges = &actual.pcie_root_complex_ranges[0]; + + assert!( + ranges.ecam_range.end() <= 4 * GB, + "ECAM should be below 4 GB" + ); + assert_eq!(ranges.low_mmio.len(), 64 * MB); + assert_eq!(ranges.high_mmio.len(), GB); + assert_eq!( + actual + .memory_layout + .probe_address(ranges.ecam_range.start()), + Some(AddressType::PciEcam) + ); + assert_eq!( + actual.memory_layout.probe_address(ranges.low_mmio.start()), + Some(AddressType::PciMmio) + ); + assert_eq!( + actual.memory_layout.probe_address(ranges.high_mmio.start()), + Some(AddressType::PciMmio) + ); + } + + #[test] + fn shared_segment_gets_contiguous_ecam() { + // Two root complexes on the same segment with disjoint bus ranges + // must get ECAM sub-ranges within a single contiguous block, so + // that the MCFG bus-0 base address is the same for both. 
+ let root_complexes = [ + PcieRootComplexConfig { + index: 0, + name: "rc0".to_string(), + segment: 0, + start_bus: 0, + end_bus: 15, + low_mmio: PcieMmioRangeConfig::Dynamic { size: 32 * MB }, + high_mmio: PcieMmioRangeConfig::Dynamic { size: GB }, + ports: Vec::new(), + }, + PcieRootComplexConfig { + index: 1, + name: "rc1".to_string(), + segment: 0, + start_bus: 16, + end_bus: 31, + low_mmio: PcieMmioRangeConfig::Dynamic { size: 32 * MB }, + high_mmio: PcieMmioRangeConfig::Dynamic { size: GB }, + ports: Vec::new(), + }, + ]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let actual = resolve_memory_layout(config).unwrap(); + let r0 = &actual.pcie_root_complex_ranges[0]; + let r1 = &actual.pcie_root_complex_ranges[1]; + + // rc0 ends exactly where rc1 starts (contiguous). + assert_eq!(r0.ecam_range.end(), r1.ecam_range.start()); + + // Both derive the same MCFG bus-0 base. + let bus0_base_r0 = r0.ecam_range.start() + - u64::from(root_complexes[0].start_bus) * PCIE_ECAM_BYTES_PER_BUS; + let bus0_base_r1 = r1.ecam_range.start() + - u64::from(root_complexes[1].start_bus) * PCIE_ECAM_BYTES_PER_BUS; + assert_eq!(bus0_base_r0, bus0_base_r1); + } + + #[test] + fn full_bus_range_ecam_does_not_overflow() { + // A single RC spanning buses 0..255 requires (255 - 0 + 1) = 256 + // buses. The bus count must be computed in u64, not u8, to avoid + // overflow. 
+ let root_complexes = [PcieRootComplexConfig { + index: 0, + name: "rc0".to_string(), + segment: 0, + start_bus: 0, + end_bus: 255, + low_mmio: PcieMmioRangeConfig::Dynamic { size: 64 * MB }, + high_mmio: PcieMmioRangeConfig::Dynamic { size: GB }, + ports: Vec::new(), + }]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let actual = resolve_memory_layout(config).unwrap(); + let ranges = &actual.pcie_root_complex_ranges[0]; + assert_eq!(ranges.ecam_range.len(), 256 * PCIE_ECAM_BYTES_PER_BUS); + } + + #[test] + fn sub_gb_numa_nodes_use_two_mb_alignment() { + let sizes = [512 * MB, 512 * MB]; + + let actual = resolve(input(GB, Some(&sizes), None)); + + assert_eq!( + actual.ram(), + &[ + MemoryRangeWithNode { + range: MemoryRange::new(0..512 * MB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(512 * MB..GB), + vnode: 1, + }, + ] + ); + } + + #[test] + fn vtl2_is_allocated_after_all_mmio() { + let actual = resolve(input(4 * GB, None, Some(vtl2_layout(2 * MB)))); + + assert!(actual.vtl2_range().is_some()); + let vtl2 = actual.vtl2_range().unwrap(); + assert_eq!(vtl2.len(), 2 * MB); + // VTL2 should be after all other allocations. 
+ for ram in actual.ram() { + assert!(vtl2.start() >= ram.range.end()); + } + } + + #[test] + fn vtl2_does_not_change_ram_placement() { + let without_vtl2 = resolve(input(2 * GB, None, None)); + let with_vtl2 = resolve(input(2 * GB, None, Some(vtl2_layout(2 * MB)))); + + assert_eq!(with_vtl2.ram(), without_vtl2.ram()); + } + + #[test] + fn deterministic_for_same_inputs() { + let sizes = [2 * GB, 3 * GB]; + + let first = resolve(input(5 * GB, Some(&sizes), None)); + let second = resolve(input(5 * GB, Some(&sizes), None)); + + assert_eq!(first.ram(), second.ram()); + assert_eq!(first.end_of_layout(), second.end_of_layout()); + } + + #[test] + fn host_width_validation_happens_after_allocation() { + // Use enough RAM that the layout (RAM + chipset high MMIO + arch + // reserved zone) exceeds 32 bits. + let mut config = input(4 * GB, None, None); + config.physical_address_size = 32; + + let err = resolve_memory_layout(config).unwrap_err(); + + assert!(err.to_string().contains("memory layout ends at")); + } + + #[test] + fn virtio_mmio_slots_are_allocated_in_mmio32() { + let mut config = input(2 * GB, None, None); + config.virtio_mmio_count = 3; + + let result = resolve_memory_layout(config).unwrap(); + + let region = result.virtio_mmio_region; + assert_eq!(region.len(), 3 * PAGE_SIZE); + assert!(region.end() <= 4 * GB, "virtio-mmio should be below 4 GB"); + } + + #[test] + fn virtio_mmio_does_not_move_ram() { + let without = resolve(input(2 * GB, None, None)); + let mut config = input(2 * GB, None, None); + config.virtio_mmio_count = 2; + let with = resolve_memory_layout(config).unwrap(); + + assert_eq!(with.memory_layout.ram(), without.ram()); + } + + #[test] + fn zero_virtio_mmio_produces_no_region() { + let config = input(2 * GB, None, None); + + let result = resolve_memory_layout(config).unwrap(); + + assert!(result.virtio_mmio_region.is_empty()); + } + + #[test] + fn vtl2_chipset_mmio_is_post_mmio() { + let mut config = input(2 * GB, None, None); + 
config.layout.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; + + let result = resolve_memory_layout(config).unwrap(); + + let vtl2_mmio = result.vtl2_chipset_mmio; + assert_eq!(vtl2_mmio.len(), DEFAULT_VTL2_CHIPSET_MMIO_SIZE); + // VTL2 chipset MMIO should be after all VTL0-visible ranges. + let chipset_high = result.chipset_high_mmio; + assert!( + vtl2_mmio.start() >= chipset_high.end(), + "VTL2 chipset MMIO should be after VTL0 high MMIO" + ); + } + + #[test] + fn vtl2_chipset_mmio_does_not_move_vtl0_layout() { + let without = resolve(input(2 * GB, None, None)); + let mut config = input(2 * GB, None, None); + config.layout.vtl2_chipset_mmio_size = DEFAULT_VTL2_CHIPSET_MMIO_SIZE; + let with = resolve_memory_layout(config).unwrap(); + + assert_eq!(with.memory_layout.ram(), without.ram()); + } + + #[test] + fn disabled_chipset_mmio_still_reports_arch_reserved() { + // Even when the caller does not request any chipset MMIO, the + // architectural reserved zone (LAPIC, IOAPIC, TPM, ...) is still + // carved out of RAM at the top of 4 GiB. That range must be + // reported so consumers see the same layout the allocator + // produced. + let mut config = input(2 * GB, None, None); + config.layout.chipset_low_mmio_size = 0; + config.layout.chipset_high_mmio_size = 0; + + let result = resolve_memory_layout(config).unwrap(); + + let low = result.chipset_low_mmio; + assert_eq!(low.end(), 4 * GB); + assert!(low.contains(&ARCH_RESERVED)); + assert!(result.chipset_high_mmio.is_empty()); + // The reported ranges must appear in MemoryLayout::mmio() preserving + // the positional contract: [0] = low, [1] = high (EMPTY placeholder). + assert_eq!(result.memory_layout.mmio(), &[low, MemoryRange::EMPTY]); + } + + #[test] + fn asymmetric_chipset_mmio_is_accepted() { + // Asymmetric chipset MMIO (only low or only high) is allowed. + // The missing range is EMPTY. 
+ let mut config = input(2 * GB, None, None); + config.layout.chipset_high_mmio_size = 0; + let result = resolve_memory_layout(config).unwrap(); + assert!(!result.chipset_low_mmio.is_empty()); + assert!(result.chipset_high_mmio.is_empty()); + + let mut config = input(2 * GB, None, None); + config.layout.chipset_low_mmio_size = 0; + let result = resolve_memory_layout(config).unwrap(); + // Low is always at least the arch reserved zone. + assert!(!result.chipset_low_mmio.is_empty()); + // High is still configured in this case. + assert!(!result.chipset_high_mmio.is_empty()); + } + + #[test] + fn fixed_low_mmio_above_4gb_is_rejected() { + let root_complexes = [pcie_root_complex( + // A 1 GiB fixed low MMIO range placed above 4 GiB violates the + // Mmio32 placement contract. + PcieMmioRangeConfig::Fixed(MemoryRange::new(5 * GB..6 * GB)), + PcieMmioRangeConfig::Dynamic { size: GB }, + )]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + let err = resolve_memory_layout(config).unwrap_err(); + assert!( + err.to_string().contains("must end at or below 4 GiB"), + "unexpected error: {err}" + ); + } + + #[test] + fn ecam_below_256mb_is_rejected() { + // Force ECAM placement below 256 MiB by reserving most of the free + // Mmio32 window for low_mmio. The fixed chipset_low_mmio at the top + // of 32-bit space leaves 3968 MiB on x86_64 and 3584 MiB on aarch64 + // for dynamic Mmio32 requests; size low_mmio to push ECAM near + // 127 MiB on both. The resolver must bail because MCFG cannot + // represent a bus-0 base below the ECAM start. 
+ let low_mmio_size = if cfg!(guest_arch = "x86_64") { + 3840 * MB + } else { + 3456 * MB + }; + let root_complexes = [pcie_root_complex( + PcieMmioRangeConfig::Dynamic { + size: low_mmio_size, + }, + PcieMmioRangeConfig::Dynamic { size: GB }, + )]; + let mut config = input(2 * GB, None, None); + config.pcie_root_complexes = &root_complexes; + + let err = resolve_memory_layout(config).unwrap_err(); + + assert!(err.to_string().contains("ECAM"), "unexpected error: {err}"); + } +} diff --git a/openvmm/openvmm_core/src/worker/mod.rs b/openvmm/openvmm_core/src/worker/mod.rs index 23d0fe91f0..b36faa2154 100644 --- a/openvmm/openvmm_core/src/worker/mod.rs +++ b/openvmm/openvmm_core/src/worker/mod.rs @@ -2,5 +2,6 @@ // Licensed under the MIT License. pub mod dispatch; +mod memory_layout; mod rom; pub mod vm_loaders; diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs index 05a0124e66..9305c8dd0f 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/igvm.rs @@ -30,7 +30,6 @@ use openvmm_defs::config::SerialInformation; use openvmm_defs::config::Vtl2BaseAddressType; use range_map_vec::RangeMap; use std::collections::HashMap; -use std::ffi::CString; use std::io::Read; use std::io::Seek; use thiserror::Error; @@ -46,8 +45,8 @@ use zerocopy::IntoBytes; #[derive(Debug, Error)] pub enum Error { - #[error("command line is not a valid C string")] - InvalidCommandLine(#[source] std::ffi::NulError), + #[error("command line contains an embedded NUL byte at offset {0}")] + CommandLineContainsNul(usize), #[error("failed to read igvm file")] Igvm(#[source] std::io::Error), #[error("invalid igvm file")] @@ -76,16 +75,14 @@ pub enum Error { NoVtl2MemoryRange, #[error("no vtl2 memory source in igvm file")] Vtl2MemorySource, - #[error("invalid memory config")] - MemoryConfig(#[source] vm_topology::memory::Error), - #[error("not enough physical address bits to allocate 
vtl2 range")] - NotEnoughPhysicalAddressBits, #[error("building device tree for partition failed")] DeviceTree(fdt::builder::Error), #[error("supplied vtl2 memory {0} is not aligned to 2MB")] Vtl2MemoryAligned(u64), #[error("supplied vtl2 memory {0} is smaller than igvm file VTL2 range {1}")] Vtl2MemoryTooSmall(u64, u64), + #[error("invalid vtl2 relocation alignment {0:#x}")] + Vtl2RelocationAlignment(u64), #[error("unsupported guest architecture")] UnsupportedGuestArch, #[error("igvm file does not support vbs")] @@ -94,8 +91,6 @@ pub enum Error { LowerVtlContext, #[error("missing required memory range {0}")] MissingRequiredMemory(MemoryRange), - #[error("IGVM file requires at least two mmio ranges")] - UnsupportedMmio, } fn from_memory_range(range: &MemoryRange) -> IGVM_VHS_MEMORY_RANGE { @@ -199,17 +194,21 @@ pub fn vtl2_memory_info(igvm_file: &IgvmFile) -> Result { } } -/// Determine a location to allocate VTL2 memory, based on VM information and a -/// provided `igvm_file`. -pub fn vtl2_memory_range( - physical_address_size: u8, - mem_size: u64, - mmio_gaps: &[MemoryRange], - pci_ecam_gaps: &[MemoryRange], - pci_mmio_gaps: &[MemoryRange], +/// Information needed to allocate a VTL2 memory range in the VM memory layout. +#[derive(Debug, Clone, Copy)] +pub struct Vtl2MemoryLayoutRequest { + /// The number of bytes to reserve for VTL2. + pub size: u64, + /// The required relocation alignment. + pub alignment: u64, +} + +/// Determine the VTL2 memory allocation constraints from a provided +/// `igvm_file`. +pub fn vtl2_memory_layout_request( igvm_file: &IgvmFile, vtl2_size: Option, -) -> Result { +) -> Result { let (mask, _max_vtl) = match vbs_platform_header(igvm_file)? 
{ IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert_eq!(info.platform_type, IgvmPlatformType::VSM_ISOLATION); @@ -228,6 +227,9 @@ pub fn vtl2_memory_range( let reloc_region = relocs.0.ok_or(Error::RelocationNotSupported)?[0].clone(); let alignment = reloc_region.relocation_alignment; + if alignment < HV_PAGE_SIZE || !alignment.is_power_of_two() { + return Err(Error::Vtl2RelocationAlignment(alignment)); + } let size = match vtl2_size { Some(vtl2_size) => { @@ -248,64 +250,40 @@ pub fn vtl2_memory_range( } }; - let align_base = |base| -> u64 { (base + alignment - 1) & !(alignment - 1) }; - - // Use one bit below the maximum possible address, as the VTL0 alias map - // will use the highest available bit of the physical address space. - let physical_address_size = physical_address_size - 1; - - // Create an initial memory layout to determine the highest used address. - let dummy_layout = MemoryLayout::new(mem_size, mmio_gaps, pci_ecam_gaps, pci_mmio_gaps, None) - .map_err(Error::MemoryConfig)?; - - // TODO: Underhill kernel panics if loaded at 32TB or higher. Restrict the - // max address to 32TB until this is fixed. - const MAX_ADDR_32TB: u64 = 32u64 << 40; // 0x2000_0000_0000 bytes - let max_physical_address = 1 << physical_address_size; - let max_physical_address = max_physical_address.min(MAX_ADDR_32TB); - - // With more than two mmio gaps, it's harder to reason about which space is - // free or not in the address space to allocate a VTL2 range. Take a - // shortcut and place VTL2 above the end of ram or mmio. - let (min_addr, max_addr) = (dummy_layout.end_of_layout(), max_physical_address); - - let aligned_min_addr = align_base(min_addr); - let aligned_max_addr = (max_addr / alignment) * alignment; - - assert!(aligned_min_addr >= reloc_region.minimum_relocation_gpa); - assert!(aligned_max_addr <= reloc_region.maximum_relocation_gpa); - - // It's possible that the min_addr is above the physical address size of the - // system. 
Fail now as mapping ram would fail later. - if aligned_min_addr >= aligned_max_addr { - return Err(Error::NotEnoughPhysicalAddressBits); - } - - tracing::trace!(min_addr, aligned_min_addr, max_addr, aligned_max_addr); - - // Select a random base within the alignment - let possible_bases = (aligned_max_addr - aligned_min_addr) / alignment; - let mut num: u64 = 0; - getrandom::fill(num.as_mut_bytes()).expect("crng failure"); - let selected_base = num % (possible_bases - 1); - let selected_addr = aligned_min_addr + (selected_base * alignment); - tracing::trace!(possible_bases, selected_base, selected_addr); - - Ok(MemoryRange::new(selected_addr..(selected_addr + size))) + Ok(Vtl2MemoryLayoutRequest { size, alignment }) } -/// Build a device tree representing the whole guest partition. -fn build_device_tree( - processor_topology: &ProcessorTopology, - mem_layout: &MemoryLayout, - all_ram: &[MemoryRangeWithNode], - vtl2_protectable_ram: &[MemoryRange], +/// Parameters for [`build_device_tree`]. +struct BuildDeviceTreeParams<'a> { + processor_topology: &'a ProcessorTopology, + all_ram: &'a [MemoryRangeWithNode], + vtl2_protectable_ram: &'a [MemoryRange], vtl2_base_address: Vtl2BaseAddressType, - command_line: &str, + command_line: &'a str, with_vmbus_redirect: bool, com_serial: Option, - entropy: Option<&[u8]>, -) -> Result, fdt::builder::Error> { + entropy: Option<&'a [u8]>, + chipset_low_mmio: MemoryRange, + chipset_high_mmio: MemoryRange, + vtl2_chipset_mmio: MemoryRange, +} + +/// Build a device tree representing the whole guest partition. 
+fn build_device_tree(params: BuildDeviceTreeParams<'_>) -> Result, fdt::builder::Error> { + let BuildDeviceTreeParams { + processor_topology, + all_ram, + vtl2_protectable_ram, + vtl2_base_address, + command_line, + with_vmbus_redirect, + com_serial, + entropy, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, + } = params; + let mut buf = vec![0; HV_PAGE_SIZE as usize * 256]; let mut builder = fdt::builder::Builder::new(fdt::builder::BuilderConfig { @@ -377,27 +355,23 @@ fn build_device_tree( .add_u32(p_size_cells, 2)? .add_prop_array(p_ranges, &[])?; - // Determine how much mmio this system has. 2 or less gaps are reported to - // VTL0. The 3rd and/or 4th gap will be reported to VTL2. Any more are - // ignored. - let mut mmio_chunks = mem_layout.mmio().chunks(2); - - let extract_ranges = |mmio: Option<&[MemoryRange]>| -> Vec { - let mut ranges = Vec::new(); + // Build DT ranges for VMBus devices. VTL0 gets the chipset low/high MMIO + // ranges; VTL2 gets its own private chipset MMIO range. + let ranges_vtl0: Vec = [chipset_low_mmio, chipset_high_mmio] + .into_iter() + .flat_map(|range| [range.start(), range.start(), range.len()]) + .collect(); - if let Some(mmio) = mmio { - for entry in mmio { - ranges.push(entry.start()); - ranges.push(entry.start()); - ranges.push(entry.len()); - } - } - ranges + let ranges_vtl2: Vec = if vtl2_chipset_mmio.is_empty() { + vec![] + } else { + vec![ + vtl2_chipset_mmio.start(), + vtl2_chipset_mmio.start(), + vtl2_chipset_mmio.len(), + ] }; - let ranges_vtl0 = extract_ranges(mmio_chunks.next()); - let ranges_vtl2 = extract_ranges(mmio_chunks.next()); - // VTL0 vmbus root device let vmbus_vtl0_name = if ranges_vtl0.is_empty() { "vmbus-vtl0".into() @@ -549,6 +523,12 @@ pub struct LoadIgvmParams<'a, T: ArchTopology> { pub com_serial: Option, /// Entropy pub entropy: Option<&'a [u8]>, + /// VTL0 chipset low MMIO range for the device tree VMBus node. 
+ pub chipset_low_mmio: MemoryRange, + /// VTL0 chipset high MMIO range for the device tree VMBus node. + pub chipset_high_mmio: MemoryRange, + /// VTL2-private chipset MMIO range for the device tree VTL2 VMBus node. + pub vtl2_chipset_mmio: MemoryRange, } pub fn load_igvm( @@ -591,6 +571,9 @@ fn load_igvm_x86( with_vmbus_redirect, com_serial, entropy, + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, } = params; let relocations_enabled = match vtl2_base_address { @@ -608,7 +591,12 @@ fn load_igvm_x86( cmdline.to_string() }; - let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; + // The command line is exposed to the guest as a NUL-terminated byte + // sequence (via the IGVM CommandLine parameter), so reject any embedded NUL + // bytes up front. + if let Some(pos) = cmdline.as_bytes().iter().position(|&b| b == 0) { + return Err(Error::CommandLineContainsNul(pos)); + } let (mask, max_vtl) = match vbs_platform_header(igvm_file)? { IgvmPlatformHeader::SupportedPlatform(info) => { @@ -968,14 +956,12 @@ fn load_igvm_x86( } } IgvmDirectiveHeader::MmioRanges(ref info) => { - // Convert the OpenVMM format to the IGVM format - // Any gaps above 2 are ignored. - let mmio = mem_layout.mmio(); - if mmio.len() < 2 { - return Err(Error::UnsupportedMmio); - } + // Convert the chipset MMIO ranges to the IGVM format. 
let mmio_ranges = IGVM_VHS_MMIO_RANGES { - mmio_ranges: [from_memory_range(&mmio[0]), from_memory_range(&mmio[1])], + mmio_ranges: [ + from_memory_range(&chipset_low_mmio), + from_memory_range(&chipset_high_mmio), + ], }; import_parameter(&mut parameter_areas, info, mmio_ranges.as_bytes())?; } @@ -984,20 +970,25 @@ fn load_igvm_x86( import_parameter(&mut parameter_areas, info, memory_map.as_bytes())?; } IgvmDirectiveHeader::CommandLine(ref info) => { - import_parameter(&mut parameter_areas, info, command_line.as_bytes_with_nul())?; + let mut bytes = Vec::with_capacity(cmdline.len() + 1); + bytes.extend_from_slice(cmdline.as_bytes()); + bytes.push(0); + import_parameter(&mut parameter_areas, info, &bytes)?; } IgvmDirectiveHeader::DeviceTree(ref info) => { - let dt = build_device_tree( + let dt = build_device_tree(BuildDeviceTreeParams { processor_topology, - mem_layout, - &all_ram, - &vtl2_protectable_ram, + all_ram: &all_ram, + vtl2_protectable_ram: &vtl2_protectable_ram, vtl2_base_address, - &String::from_utf8_lossy(command_line.as_bytes()), + command_line: &cmdline, with_vmbus_redirect, com_serial, entropy, - ) + chipset_low_mmio, + chipset_high_mmio, + vtl2_chipset_mmio, + }) .map_err(Error::DeviceTree)?; import_parameter(&mut parameter_areas, info, &dt)?; } diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs index 4b4aa0de6b..4d272561c1 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs @@ -10,7 +10,6 @@ use loader::linux::InitrdAddressType; use loader::linux::InitrdConfig; use loader::linux::RegisterConfig; use loader::linux::ZeroPageConfig; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64; use std::ffi::CString; use std::io::Seek; use thiserror::Error; @@ -494,9 +493,13 @@ fn build_dt( } } - assert!(DEFAULT_MMIO_GAPS_AARCH64.len() == 2); - let low_mmio_gap = DEFAULT_MMIO_GAPS_AARCH64[0]; - let high_mmio_gap = 
DEFAULT_MMIO_GAPS_AARCH64[1]; + // Build VMBus MMIO ranges from the memory layout's chipset MMIO gaps. + assert!( + cfg.mem_layout.mmio().len() >= 2, + "need at least two MMIO regions for VMBus DT node" + ); + let low_mmio_gap = cfg.mem_layout.mmio()[0]; + let high_mmio_gap = cfg.mem_layout.mmio()[1]; soc = soc .start_node("vmbus")? .add_u32(p_address_cells, 2)? diff --git a/openvmm/openvmm_defs/Cargo.toml b/openvmm/openvmm_defs/Cargo.toml index 884361ac0f..cce2dd82f2 100644 --- a/openvmm/openvmm_defs/Cargo.toml +++ b/openvmm/openvmm_defs/Cargo.toml @@ -11,7 +11,7 @@ hypervisor_resources.workspace = true openvmm_pcat_locator.workspace = true # vmcore -memory_range.workspace = true +memory_range = { workspace = true, features = ["mesh"] } vm_resource.workspace = true vmgs_resources.workspace = true diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 07b4a7c38a..4e4af4dd4b 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -55,6 +55,9 @@ pub struct Config { pub chipset_devices: Vec, pub pci_chipset_devices: Vec, pub chipset_capabilities: VmChipsetCapabilities, + /// Memory layout sizing for the layout engine. Determines chipset MMIO + /// range sizes; addresses are allocated dynamically by the resolver. + pub layout: vmm_core_defs::LayoutConfig, pub generation_id_recv: Option>, // This is used for testing. TODO: resourcify, and also store this in VMGS. pub rtc_delta_milliseconds: i64, @@ -63,36 +66,6 @@ pub struct Config { pub efi_diagnostics_log_level: EfiDiagnosticsLogLevelType, } -// ARM64 needs a larger low gap. -const DEFAULT_LOW_MMAP_GAP_SIZE_X86: u64 = 1024 * 1024 * 128; -const DEFAULT_LOW_MMAP_GAP_SIZE_AARCH64: u64 = 1024 * 1024 * 512; - -/// Default mmio gaps for an x86 partition. 
-pub const DEFAULT_MMIO_GAPS_X86: [MemoryRange; 2] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_X86..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x10_0000_0000), // 512MB just below 64GB, then up to 64GB -]; - -/// Default mmio gaps for x86 if VTL2 is enabled. -pub const DEFAULT_MMIO_GAPS_X86_WITH_VTL2: [MemoryRange; 3] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_X86..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x20_0000_0000), // 512MB just below 64GB, then up to 128GB - MemoryRange::new(0x20_0000_0000..0x20_4000_0000), // 128GB to 129 GB -]; - -/// Default mmio gaps for an aarch64 partition. -pub const DEFAULT_MMIO_GAPS_AARCH64: [MemoryRange; 2] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_AARCH64..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x10_0000_0000), // 512MB just below 64GB, then up to 64GB -]; - -/// Default mmio gaps for aarch64 if VTL2 is enabled. -pub const DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2: [MemoryRange; 3] = [ - MemoryRange::new(0x1_0000_0000 - DEFAULT_LOW_MMAP_GAP_SIZE_AARCH64..0x1_0000_0000), // nMB just below 4GB - MemoryRange::new(0xF_E000_0000..0x20_0000_0000), // 512MB just below 64GB, then up to 128GB - MemoryRange::new(0x20_0000_0000..0x20_4000_0000), // 128GB to 129 GB -]; - pub const DEFAULT_GIC_DISTRIBUTOR_BASE: u64 = 0xFFFF_0000; // The KVM in-kernel vGICv3 requires the distributor and redistributor bases be 64KiB aligned. pub const DEFAULT_GIC_REDISTRIBUTORS_BASE: u64 = if cfg!(target_os = "linux") { @@ -216,6 +189,20 @@ pub enum Vtl2BaseAddressType { Vtl2Allocate { size: Option }, } +/// Specifies a PCIe MMIO BAR window, either by size (the resolver allocates) or +/// by a fixed location. Fixed locations exist for assigned-device, IOMMU, and +/// physical-topology compatibility. 
+#[derive(Debug, MeshPayload)] +pub enum PcieMmioRangeConfig { + /// Dynamically allocate a range of the given size. + Dynamic { + /// Size of the range in bytes. + size: u64, + }, + /// Use the specified fixed memory range. + Fixed(MemoryRange), +} + #[derive(Debug, MeshPayload)] pub struct PcieRootComplexConfig { pub index: u32, @@ -223,9 +210,8 @@ pub struct PcieRootComplexConfig { pub segment: u16, pub start_bus: u8, pub end_bus: u8, - pub ecam_range: MemoryRange, - pub low_mmio: MemoryRange, - pub high_mmio: MemoryRange, + pub low_mmio: PcieMmioRangeConfig, + pub high_mmio: PcieMmioRangeConfig, pub ports: Vec, } @@ -360,9 +346,6 @@ pub struct MemoryConfig { pub transparent_hugepages: bool, pub hugepages: bool, pub hugepage_size: Option, - pub mmio_gaps: Vec, - pub pci_ecam_gaps: Vec, - pub pci_mmio_gaps: Vec, /// Test only: per-NUMA-node memory sizes. When set, RAM is distributed /// across vNUMA nodes according to these sizes instead of assigning all RAM /// to node 0. The sum must equal `mem_size`. 
diff --git a/openvmm/openvmm_entry/Cargo.toml b/openvmm/openvmm_entry/Cargo.toml index 8e3164cd7b..82f8103bef 100644 --- a/openvmm/openvmm_entry/Cargo.toml +++ b/openvmm/openvmm_entry/Cargo.toml @@ -40,7 +40,6 @@ get_resources.workspace = true hyperv_ic_resources.workspace = true ide_resources.workspace = true input_core.workspace = true -memory_range.workspace = true net_backend_resources.workspace = true netvsp_resources.workspace = true nvme_resources.workspace = true diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index a99441fc0a..6afbc14913 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -58,7 +58,6 @@ use guid::Guid; use input_core::MultiplexedInputHandle; use inspect::InspectMut; use io::Read; -use memory_range::MemoryRange; use mesh::CancelContext; use mesh::CellUpdater; use mesh::rpc::RpcSend; @@ -66,10 +65,6 @@ use meshworker::VmmMesh; use net_backend_resources::mac_address::MacAddress; use nvme_resources::NvmeControllerRequest; use openvmm_defs::config::Config; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2; use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::EfiDiagnosticsLogLevelType; @@ -78,6 +73,7 @@ use openvmm_defs::config::LateMapVtl0MemoryPolicy; use openvmm_defs::config::LoadMode; use openvmm_defs::config::MemoryConfig; use openvmm_defs::config::PcieDeviceConfig; +use openvmm_defs::config::PcieMmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; use openvmm_defs::config::PcieSwitchConfig; @@ -86,7 +82,6 @@ use openvmm_defs::config::SerialInformation; use openvmm_defs::config::VirtioBus; use openvmm_defs::config::VmbusConfig; use openvmm_defs::config::VpciDeviceConfig; -use 
openvmm_defs::config::Vtl2BaseAddressType; use openvmm_defs::config::Vtl2Config; use openvmm_defs::rpc::VmRpc; use openvmm_defs::worker::VM_WORKER; @@ -719,33 +714,12 @@ async fn vm_config_from_command_line( }), ); - // If VTL2 is enabled, and we are not in VTL2 self allocate mode, provide an - // mmio gap for VTL2. - let use_vtl2_gap = opt.vtl2 - && !matches!( - opt.igvm_vtl2_relocation_type, - Vtl2BaseAddressType::Vtl2Allocate { .. }, - ); + let mut pcie_root_complexes = Vec::new(); #[cfg(guest_arch = "aarch64")] let arch = MachineArch::Aarch64; #[cfg(guest_arch = "x86_64")] let arch = MachineArch::X86_64; - - let mmio_gaps: Vec = match (use_vtl2_gap, arch) { - (true, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(), - (true, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2.into(), - (false, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86.into(), - (false, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64.into(), - }; - - let mut pci_ecam_gaps = Vec::new(); - let mut pci_mmio_gaps = Vec::new(); - - let mut low_mmio_start = mmio_gaps.first().context("expected mmio gap")?.start(); - let mut high_mmio_end = mmio_gaps.last().context("expected second mmio gap")?.end(); - - let mut pcie_root_complexes = Vec::new(); for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() { let ports = opt .pcie_root_port @@ -764,43 +738,22 @@ async fn vm_config_from_command_line( .high_mmio .checked_next_multiple_of(ONE_MB) .context("high mmio rounding error")?; - let ecam_size = (((rc_cli.end_bus - rc_cli.start_bus) as u64) + 1) * 256 * 4096; - - let low_pci_mmio_start = low_mmio_start - .checked_sub(low_mmio_size) - .context("pci low mmio underflow")?; - let ecam_start = low_pci_mmio_start - .checked_sub(ecam_size) - .context("pci ecam underflow")?; - low_mmio_start = ecam_start; - high_mmio_end = high_mmio_end - .checked_add(high_mmio_size) - .context("pci high mmio overflow")?; - - let ecam_range = MemoryRange::new(ecam_start..ecam_start + ecam_size); - 
let low_mmio = MemoryRange::new(low_pci_mmio_start..low_pci_mmio_start + low_mmio_size); - let high_mmio = MemoryRange::new(high_mmio_end - high_mmio_size..high_mmio_end); - - pci_ecam_gaps.push(ecam_range); - pci_mmio_gaps.push(low_mmio); - pci_mmio_gaps.push(high_mmio); - pcie_root_complexes.push(PcieRootComplexConfig { index: i as u32, name: rc_cli.name.clone(), segment: rc_cli.segment, start_bus: rc_cli.start_bus, end_bus: rc_cli.end_bus, - ecam_range, - low_mmio, - high_mmio, + low_mmio: PcieMmioRangeConfig::Dynamic { + size: low_mmio_size, + }, + high_mmio: PcieMmioRangeConfig::Dynamic { + size: high_mmio_size, + }, ports, }); } - pci_ecam_gaps.sort(); - pci_mmio_gaps.sort(); - let pcie_switches = build_switch_list(&opt.pcie_switch); #[cfg(target_os = "linux")] @@ -919,6 +872,7 @@ async fn vm_config_from_command_line( // TODO: load from VMGS file if it exists let bios_guid = Guid::new_random(); + let layout_config = chipset.layout_config(); let VmChipsetResult { chipset, mut chipset_devices, @@ -1622,14 +1576,11 @@ async fn vm_config_from_command_line( } else { opt.memory_size() }, - mmio_gaps, prefetch_memory: opt.prefetch_memory(), private_memory: opt.private_memory(), transparent_hugepages: opt.transparent_hugepages(), hugepages: opt.memory.hugepages, hugepage_size: opt.memory.hugepage_size, - pci_ecam_gaps, - pci_mmio_gaps, numa_mem_sizes: opt.numa_memory.clone(), }, processor_topology: ProcessorTopologyConfig { @@ -1681,6 +1632,7 @@ async fn vm_config_from_command_line( chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, + layout: layout_config, #[cfg(windows)] vpci_resources, vmgs, diff --git a/openvmm/openvmm_entry/src/ttrpc/mod.rs b/openvmm/openvmm_entry/src/ttrpc/mod.rs index 385d01bdf2..8fc1eb398e 100644 --- a/openvmm/openvmm_entry/src/ttrpc/mod.rs +++ b/openvmm/openvmm_entry/src/ttrpc/mod.rs @@ -33,7 +33,6 @@ use mesh_worker::WorkerId; use mesh_worker::WorkerRpc; use netvsp_resources::NetvspHandle; use 
openvmm_defs::config::Config; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::HypervisorConfig; use openvmm_defs::config::LoadMode; @@ -540,13 +539,15 @@ impl VmService { })?); } - let chipset = VmManifestBuilder::new( + let chipset_builder = VmManifestBuilder::new( vm_manifest_builder::BaseChipsetType::HyperVGen2LinuxDirect, vm_manifest_builder::MachineArch::X86_64, ) - .with_serial(ports) - .build() - .context("failed to build vm configuration")?; + .with_serial(ports); + let layout_config = chipset_builder.layout_config(); + let chipset = chipset_builder + .build() + .context("failed to build vm configuration")?; // Extract memory and processor counts for the VmController. let config_mem_size = req_config @@ -573,9 +574,6 @@ impl VmService { vpci_devices: vec![], memory: MemoryConfig { mem_size: config_mem_size, - mmio_gaps: DEFAULT_MMIO_GAPS_X86.into(), - pci_ecam_gaps: vec![], - pci_mmio_gaps: vec![], prefetch_memory: false, private_memory: false, transparent_hugepages: false, @@ -614,6 +612,7 @@ impl VmService { chipset_devices: chipset.chipset_devices, pci_chipset_devices: chipset.pci_chipset_devices, chipset_capabilities: chipset.capabilities, + layout: layout_config, generation_id_recv: None, rtc_delta_milliseconds: 0, automatic_guest_reset: true, diff --git a/petri/Cargo.toml b/petri/Cargo.toml index 0ca720467f..14b599a273 100644 --- a/petri/Cargo.toml +++ b/petri/Cargo.toml @@ -31,7 +31,6 @@ disk_backend_resources.workspace = true framebuffer.workspace = true get_resources.workspace = true ide_resources.workspace = true -memory_range.workspace = true net_backend_resources.workspace = true netvsp_resources.workspace = true nvme_resources.workspace = true diff --git a/petri/src/vm/mod.rs b/petri/src/vm/mod.rs index c67ee590a1..42bfff6092 100644 --- a/petri/src/vm/mod.rs +++ b/petri/src/vm/mod.rs @@ -22,7 +22,6 @@ use crate::vtl2_settings::Vtl2StorageControllerBuilder; use 
async_trait::async_trait; use get_resources::ged::FirmwareEvent; use guid::Guid; -use memory_range::MemoryRange; use mesh::CancelContext; use openvmm_defs::config::Vtl2BaseAddressType; use pal_async::DefaultDriver; @@ -2130,16 +2129,6 @@ pub enum ApicMode { X2apicEnabled, } -/// Mmio configuration. -#[derive(Debug)] -pub enum MmioConfig { - /// The platform provided default. - Platform, - /// Custom mmio gaps. - /// TODO: Not supported on all platforms (ie Hyper-V). - Custom(Vec), -} - /// Common memory configuration information for the VM. #[derive(Debug)] pub struct MemoryConfig { @@ -2150,8 +2139,6 @@ pub struct MemoryConfig { /// /// Dynamic memory will be disabled if this is `None`. pub dynamic_memory_range: Option<(u64, u64)>, - /// Specifies the mmio gaps to use, either platform or custom. - pub mmio_gaps: MmioConfig, /// Per-NUMA-node memory sizes. When set, RAM is distributed across /// vNUMA nodes instead of assigning all RAM to node 0. pub numa_mem_sizes: Option>, @@ -2162,7 +2149,6 @@ impl Default for MemoryConfig { Self { startup_bytes: 4 * 1024 * 1024 * 1024, // 4 GiB dynamic_memory_range: None, - mmio_gaps: MmioConfig::Platform, numa_mem_sizes: None, } } diff --git a/petri/src/vm/openvmm/construct.rs b/petri/src/vm/openvmm/construct.rs index f6df910cc8..44e6f3dade 100644 --- a/petri/src/vm/openvmm/construct.rs +++ b/petri/src/vm/openvmm/construct.rs @@ -23,7 +23,6 @@ use crate::UefiConfig; use crate::VmbusStorageType; use crate::linux_direct_serial_agent::LinuxDirectSerialAgent; -use crate::MmioConfig; use crate::SIZE_1_MB; use crate::VmbusStorageController; use crate::openvmm::memdiff_vmgs; @@ -46,10 +45,6 @@ use mesh_process::Mesh; use nvme_resources::NamespaceDefinition; use nvme_resources::NvmeControllerHandle; use openvmm_defs::config::Config; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2; -use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86; -use 
openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2; use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::HypervisorConfig; @@ -353,6 +348,7 @@ impl PetriVmConfigOpenVmm { let mut vsock_listener = Some(vsock_listener); let vsock_path_string = vsock_path.to_string_lossy(); + let layout_config = chipset.layout_config(); let chipset = chipset .build() .context("failed to build chipset configuration")?; @@ -361,7 +357,6 @@ impl PetriVmConfigOpenVmm { let MemoryConfig { startup_bytes, dynamic_memory_range, - mmio_gaps, numa_mem_sizes, } = memory; @@ -380,24 +375,6 @@ impl PetriVmConfigOpenVmm { openvmm_defs::config::MemoryConfig { mem_size, - mmio_gaps: match mmio_gaps { - MmioConfig::Platform => { - if firmware.is_openhcl() { - match arch { - MachineArch::X86_64 => DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(), - MachineArch::Aarch64 => DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2.into(), - } - } else { - match arch { - MachineArch::X86_64 => DEFAULT_MMIO_GAPS_X86.into(), - MachineArch::Aarch64 => DEFAULT_MMIO_GAPS_AARCH64.into(), - } - } - } - MmioConfig::Custom(ranges) => ranges, - }, - pci_ecam_gaps: vec![], - pci_mmio_gaps: vec![], prefetch_memory: false, private_memory: false, transparent_hugepages: false, @@ -520,6 +497,7 @@ impl PetriVmConfigOpenVmm { chipset_devices, pci_chipset_devices, chipset_capabilities: capabilities, + layout: layout_config, // Basic virtualization device support hypervisor: HypervisorConfig { diff --git a/petri/src/vm/openvmm/modify.rs b/petri/src/vm/openvmm/modify.rs index ef93b552dd..dcaadb322c 100644 --- a/petri/src/vm/openvmm/modify.rs +++ b/petri/src/vm/openvmm/modify.rs @@ -18,7 +18,6 @@ use gdma_resources::GdmaDeviceHandle; use gdma_resources::VportDefinition; use get_resources::ged::IgvmAttestTestConfig; use guid::Guid; -use memory_range::MemoryRange; use net_backend_resources::mac_address::MacAddress; use nvme_resources::NamespaceDefinition; use nvme_resources::NvmeControllerHandle; 
@@ -26,6 +25,7 @@ use openvmm_defs::config::Config; use openvmm_defs::config::DeviceVtl; use openvmm_defs::config::LoadMode; use openvmm_defs::config::PcieDeviceConfig; +use openvmm_defs::config::PcieMmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; use openvmm_defs::config::PcieSwitchConfig; @@ -242,27 +242,9 @@ impl PetriVmConfigOpenVmm { root_complex_per_segment: u64, root_ports_per_root_complex: u64, ) -> Self { - const SINGLE_BUS_NUMBER_ECAM_SIZE: u64 = 1024 * 1024; // 1 MB - const FULL_SEGMENT_ECAM_SIZE: u64 = 256 * SINGLE_BUS_NUMBER_ECAM_SIZE; // 256 MB const LOW_MMIO_SIZE: u64 = 64 * 1024 * 1024; // 64 MB const HIGH_MMIO_SIZE: u64 = 1024 * 1024 * 1024; // 1 GB - // Allocate and configure the address space gaps - let ecam_size = segment_count * FULL_SEGMENT_ECAM_SIZE; - let low_mmio_size = segment_count * root_complex_per_segment * LOW_MMIO_SIZE; - let high_mmio_size = segment_count * root_complex_per_segment * HIGH_MMIO_SIZE; - - let low_mmio_start = self.config.memory.mmio_gaps[0].start(); - let high_mmio_end = self.config.memory.mmio_gaps[1].end(); - - let ecam_gap = MemoryRange::new(low_mmio_start - ecam_size..low_mmio_start); - let low_gap = MemoryRange::new(ecam_gap.start() - low_mmio_size..ecam_gap.start()); - let high_gap = MemoryRange::new(high_mmio_end..high_mmio_end + high_mmio_size); - - self.config.memory.pci_ecam_gaps.push(ecam_gap); - self.config.memory.pci_mmio_gaps.push(low_gap); - self.config.memory.pci_mmio_gaps.push(high_gap); - // Add the root complexes to the VM for segment in 0..segment_count { let bus_count_per_rc = 256 / root_complex_per_segment; @@ -273,17 +255,6 @@ impl PetriVmConfigOpenVmm { let start_bus = rc_index_in_segment * bus_count_per_rc; let end_bus = start_bus + bus_count_per_rc - 1; - let ecam_range_start = ecam_gap.start() - + segment * FULL_SEGMENT_ECAM_SIZE - + start_bus * SINGLE_BUS_NUMBER_ECAM_SIZE; - let ecam_range_end = - ecam_range_start + 
bus_count_per_rc * SINGLE_BUS_NUMBER_ECAM_SIZE; - - let low_mmio_start = low_gap.start() + index * LOW_MMIO_SIZE; - let low_mmio_end = low_gap.start() + (index + 1) * LOW_MMIO_SIZE; - let high_mmio_start = high_gap.start() + index * HIGH_MMIO_SIZE; - let high_mmio_end = high_gap.start() + (index + 1) * HIGH_MMIO_SIZE; - let ports = (0..root_ports_per_root_complex) .map(|i| PcieRootPortConfig { name: format!("s{}rc{}rp{}", segment, rc_index_in_segment, i), @@ -298,9 +269,12 @@ impl PetriVmConfigOpenVmm { segment: segment.try_into().unwrap(), start_bus: start_bus.try_into().unwrap(), end_bus: end_bus.try_into().unwrap(), - ecam_range: MemoryRange::new(ecam_range_start..ecam_range_end), - low_mmio: MemoryRange::new(low_mmio_start..low_mmio_end), - high_mmio: MemoryRange::new(high_mmio_start..high_mmio_end), + low_mmio: PcieMmioRangeConfig::Dynamic { + size: LOW_MMIO_SIZE, + }, + high_mmio: PcieMmioRangeConfig::Dynamic { + size: HIGH_MMIO_SIZE, + }, ports, }); } diff --git a/vm/vmcore/vm_topology/src/layout.rs b/vm/vmcore/vm_topology/src/layout.rs new file mode 100644 index 0000000000..31d5d8f3de --- /dev/null +++ b/vm/vmcore/vm_topology/src/layout.rs @@ -0,0 +1,1259 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VM address-space layout allocator. +//! +//! This module provides a pure-math layout allocator that places reserved and +//! fixed ranges, 32-bit MMIO, ordinary RAM, 64-bit MMIO, and post-MMIO ranges in +//! a flat guest physical address map. It has no knowledge of specific +//! architectures, firmware types, or chipset conventions; callers express those +//! policies as reserved/fixed ranges and dynamic requests. +//! +//! # Usage +//! +//! ``` +//! use memory_range::MemoryRange; +//! use vm_topology::layout::{LayoutBuilder, Placement}; +//! +//! let mut ram = Vec::new(); +//! let mut vmbus = MemoryRange::EMPTY; +//! +//! let mut builder = LayoutBuilder::new(); +//! builder.fixed( +//! "reserved", +//! 
/// The placement class for a dynamic single-range layout request.
///
/// The class decides which allocation phase handles the request. Phases run in
/// the order 32-bit MMIO, RAM, 64-bit MMIO, post-MMIO, after all reserved and
/// fixed ranges have been removed from the free list.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Placement {
    /// The allocation must fit below the 4 GiB boundary and is placed top down.
    ///
    /// Requests of this class are sorted by descending alignment, then
    /// descending size, before placement; equal requests keep caller order.
    Mmio32,
    /// The allocation must sit above the 4 GiB boundary and is placed bottom
    /// up above RAM.
    ///
    /// The search starts at the higher of the end of RAM and 4 GiB. Requests
    /// of this class are sorted the same way as [`Placement::Mmio32`].
    Mmio64,
    /// The allocation is placed bottom up after RAM and all MMIO allocations.
    ///
    /// Post-MMIO requests are allocated in caller order, not sorted by size or
    /// alignment, so they can be used for private implementation ranges that
    /// must not perturb the guest-visible RAM/MMIO layout.
    ///
    /// Each request starts at or above the highest end address of every
    /// non-reserved range placed so far, including earlier post-MMIO requests.
    PostMmio,
}
+ PostMmio, +} + +/// A placed range returned by [`LayoutBuilder::allocate`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PlacedRange { + /// The caller-supplied tag for the request. + pub tag: Arc, + /// The kind of allocation. + pub kind: PlacedRangeKind, + /// The placed range. + pub range: MemoryRange, +} + +/// A builder for computing a deterministic VM address-space layout. +pub struct LayoutBuilder<'a> { + reserved: Vec, + fixed: Vec, + mmio32: Vec>, + ram: Vec>, + mmio64: Vec>, + post_mmio: Vec>, +} + +struct ReservedRequest { + tag: Arc, + range: MemoryRange, +} + +struct FixedRequest { + tag: Arc, + range: MemoryRange, +} + +struct DynamicRequest<'a> { + tag: Arc, + target: &'a mut MemoryRange, + size: u64, + alignment: u64, +} + +impl DynamicRequest<'_> { + /// Sort key for the dynamic placement phases: larger alignment first, then + /// larger size first. Wrapping with `Reverse` makes the descending order + /// self-evident at the call site. + fn placement_sort_key(&self) -> std::cmp::Reverse<(u64, u64)> { + std::cmp::Reverse((self.alignment, self.size)) + } +} + +struct RamRequest<'a> { + tag: Arc, + target: &'a mut Vec, + size: u64, + alignment: u64, +} + +struct AllocationState { + // Sorted, non-overlapping, non-empty ranges not yet consumed by any + // request. Keeping free space as the primary state lets each phase update + // the map incrementally instead of repeatedly subtracting all allocations + // from the whole address space. + // + // The non-empty invariant lets `remove_free_range` locate the containing + // free range with a single `partition_point` lookup. + free: Vec, + allocations: Vec, + // Highest end address of ordinary RAM. High MMIO starts here so the layout + // top is driven by requested topology rather than a caller-provided high + // MMIO bucket size or host physical-address width. 
+ ram_end: u64, +} + +impl AllocationState { + fn new() -> Self { + Self { + free: vec![MemoryRange::new(0..ADDRESS_LIMIT)], + allocations: Vec::new(), + ram_end: 0, + } + } + + fn place_fixed(&mut self, requests: &[FixedRequest]) -> Result<(), AllocateError> { + for request in requests { + self.allocate_range(&request.tag, PlacedRangeKind::Fixed, request.range); + } + + Ok(()) + } + + fn place_reserved(&mut self, requests: &[ReservedRequest]) { + for request in requests { + self.allocate_range(&request.tag, PlacedRangeKind::Reserved, request.range); + } + } + + fn place_mmio32(&mut self, requests: &mut [DynamicRequest<'_>]) -> Result<(), AllocateError> { + // Pack 32-bit MMIO from the top of the 4 GiB window downward so RAM can + // start at GPA 0 and grow upward through the lowest remaining space. + // Alignment/size ordering keeps large, constrained windows from being + // fragmented by small devices. `sort_by_key` is stable, so otherwise + // equal requests keep caller order. + requests.sort_by_key(|r| r.placement_sort_key()); + + for request in requests { + let Some(start) = + find_highest_fit(&self.free, request.size, request.alignment, 0, FOUR_GIB) + else { + return Err(exhausted_error( + &request.tag, + request.size, + request.alignment, + AllocationPhase::Mmio32, + &self.free, + 0, + FOUR_GIB, + )); + }; + + let range = MemoryRange::new(start..start + request.size); + *request.target = range; + self.allocate_range(&request.tag, PlacedRangeKind::Mmio32, range); + } + + Ok(()) + } + + fn place_ram(&mut self, requests: &mut [RamRequest<'_>]) -> Result<(), AllocateError> { + // Ordinary RAM is the only splittable request type in this API. It is + // placed after low MMIO so the resulting RAM extents describe the + // actual guest-visible memory map, including holes below 4 GiB. + // + // Requests are placed in caller order, and each request starts at or + // above the highest address used by previous RAM requests. 
A later + // RAM request never backfills a fragment that an earlier one skipped: + // this keeps the flattened RAM list sorted by address (matching the + // invariant `MemoryLayout` validates) and turns vnode order into a + // clean compatibility surface, since adding new fixed or reserved + // ranges only shifts vnodes whose own span actually covers them. + for request in requests { + let floor = self.ram_end; + let ranges = find_lowest_splittable_fit( + &self.free, + request.size, + request.alignment, + floor, + ADDRESS_LIMIT, + ) + .ok_or_else(|| { + exhausted_error( + &request.tag, + request.size, + request.alignment, + AllocationPhase::Ram, + &self.free, + floor, + ADDRESS_LIMIT, + ) + })?; + + request.target.clear(); + request.target.extend_from_slice(&ranges); + for range in ranges { + self.allocate_range(&request.tag, PlacedRangeKind::Ram, range); + } + } + + Ok(()) + } + + fn place_mmio64(&mut self, requests: &mut [DynamicRequest<'_>]) -> Result<(), AllocateError> { + // High MMIO is allocated bottom up above RAM, but never below the + // 4 GiB boundary: it is "64-bit" MMIO and must not overlap the 32-bit + // window even when RAM is small. The allocator intentionally does not + // take host physical-address width as an input; callers validate the + // resulting top against host capabilities later. 
+ requests.sort_by_key(|r| r.placement_sort_key()); + + let floor = self.ram_end.max(FOUR_GIB); + for request in requests { + let Some(start) = find_lowest_fit( + &self.free, + request.size, + request.alignment, + floor, + ADDRESS_LIMIT, + ) else { + return Err(exhausted_error( + &request.tag, + request.size, + request.alignment, + AllocationPhase::Mmio64, + &self.free, + floor, + ADDRESS_LIMIT, + )); + }; + + let range = MemoryRange::new(start..start + request.size); + *request.target = range; + self.allocate_range(&request.tag, PlacedRangeKind::Mmio64, range); + } + + Ok(()) + } + + fn place_post_mmio( + &mut self, + requests: &mut [DynamicRequest<'_>], + ) -> Result<(), AllocateError> { + // These ranges are intentionally placed after all RAM/MMIO work and in + // caller order. They are for implementation-private ranges that should + // not change the VTL0-visible layout or be reordered by alignment. + for request in requests { + let layout_top = self.layout_top(); + let Some(start) = find_lowest_fit( + &self.free, + request.size, + request.alignment, + layout_top, + ADDRESS_LIMIT, + ) else { + return Err(exhausted_error( + &request.tag, + request.size, + request.alignment, + AllocationPhase::PostMmio, + &self.free, + layout_top, + ADDRESS_LIMIT, + )); + }; + + let range = MemoryRange::new(start..start + request.size); + *request.target = range; + self.allocate_range(&request.tag, PlacedRangeKind::PostMmio, range); + } + + Ok(()) + } + + fn layout_top(&self) -> u64 { + self.allocations + .iter() + .filter(|allocation| allocation.kind != PlacedRangeKind::Reserved) + .map(|allocation| allocation.range.end()) + .max() + .unwrap_or(0) + } + + fn allocate_range(&mut self, tag: &Arc, kind: PlacedRangeKind, range: MemoryRange) { + self.remove_free_range(range); + self.allocations.push(PlacedRange { + tag: tag.clone(), + kind, + range, + }); + if kind == PlacedRangeKind::Ram { + self.ram_end = self.ram_end.max(range.end()); + } + } + + fn remove_free_range(&mut self, 
allocated: MemoryRange) { + let free_index = self + .free + .partition_point(|range| range.start() <= allocated.start()) + .checked_sub(1) + .expect("allocated range must be contained in the free list"); + assert!(self.free[free_index].contains(&allocated)); + let free_range = self.free.remove(free_index); + + let mut insert_index = free_index; + if free_range.start() < allocated.start() { + self.free.insert( + insert_index, + MemoryRange::new(free_range.start()..allocated.start()), + ); + insert_index += 1; + } + if allocated.end() < free_range.end() { + self.free.insert( + insert_index, + MemoryRange::new(allocated.end()..free_range.end()), + ); + } + } +} + +/// Error returned by [`LayoutBuilder::allocate`]. +#[derive(Debug, Error)] +pub enum AllocateError { + /// A request has an invalid size. + #[error("{tag}: invalid size {size:#x} (must be > 0 and a multiple of {PAGE_SIZE:#x})")] + InvalidSize { + /// The tag identifying the request. + tag: Arc, + /// The invalid size. + size: u64, + }, + /// A request has an invalid alignment. + #[error("{tag}: invalid alignment {alignment:#x} (must be >= {PAGE_SIZE:#x} and a power of 2)")] + InvalidAlignment { + /// The tag identifying the request. + tag: Arc, + /// The invalid alignment. + alignment: u64, + }, + /// Two fixed or reserved requests overlap. + #[error("fixed/reserved requests {tag_a} ({range_a}) and {tag_b} ({range_b}) overlap")] + FixedOverlap { + /// The tag of the first request. + tag_a: Arc, + /// The range of the first request. + range_a: MemoryRange, + /// The tag of the second request. + tag_b: Arc, + /// The range of the second request. + range_b: MemoryRange, + }, + /// A dynamic request could not be satisfied. + #[error( + "{tag}: cannot allocate {size:#x} bytes with alignment {alignment:#x} during {phase:?}; remaining free space in phase: {free_space:#x} bytes" + )] + Exhausted { + /// The tag identifying the request. + tag: Arc, + /// The requested size. 
+ size: u64, + /// The requested alignment. + alignment: u64, + /// The allocation phase. + phase: AllocationPhase, + /// The remaining free space in the phase. + free_space: u64, + }, +} + +impl<'a> LayoutBuilder<'a> { + /// Creates a new layout builder. + pub fn new() -> Self { + Self { + reserved: Vec::new(), + fixed: Vec::new(), + mmio32: Vec::new(), + ram: Vec::new(), + mmio64: Vec::new(), + post_mmio: Vec::new(), + } + } + + /// Reserves a range so no allocation can use it. + /// + /// Reserved ranges are removed from the free list and may appear in the + /// returned [`PlacedRange`] list, but they do not affect post-MMIO + /// placement. Trailing reserved ranges are omitted from the returned list. + pub fn reserve(&mut self, tag: impl Into>, range: MemoryRange) { + self.reserved.push(ReservedRequest { + tag: tag.into(), + range, + }); + } + + /// Adds a fixed range request to the builder. + /// + pub fn fixed(&mut self, tag: impl Into>, range: MemoryRange) { + self.fixed.push(FixedRequest { + tag: tag.into(), + range, + }); + } + + /// Adds a dynamic single-range request to the builder. + /// + /// The target is filled in when [`Self::allocate`] succeeds. + pub fn request( + &mut self, + tag: impl Into>, + target: &'a mut MemoryRange, + size: u64, + alignment: u64, + placement: Placement, + ) { + let request = DynamicRequest { + tag: tag.into(), + target, + size, + alignment, + }; + match placement { + Placement::Mmio32 => self.mmio32.push(request), + Placement::Mmio64 => self.mmio64.push(request), + Placement::PostMmio => self.post_mmio.push(request), + } + } + + /// Adds an ordinary RAM request to the builder. + /// + /// RAM requests are placed in caller order. The first request is placed + /// bottom up from GPA 0; each subsequent request starts at or above the + /// highest address used by previous RAM requests, so later requests never + /// backfill fragments skipped by earlier ones. 
A single request may still + /// split around fixed and Mmio32 ranges encountered inside its own span; + /// each extent starts at `alignment`, and split extents that do not + /// satisfy the rest of the request are rounded down to `alignment` so + /// large aligned requests are not fragmented into smaller chunks. The + /// target vector is replaced with the placed RAM extents when + /// [`Self::allocate`] succeeds. + pub fn ram( + &mut self, + tag: impl Into>, + target: &'a mut Vec, + size: u64, + alignment: u64, + ) { + self.ram.push(RamRequest { + tag: tag.into(), + target, + size, + alignment, + }); + } + + /// Allocates all requests, fills in each target, and returns every placed + /// range sorted by address. + pub fn allocate(mut self) -> Result, AllocateError> { + validate_requests(&self.reserved, |r| (&r.tag, r.range.len(), PAGE_SIZE))?; + validate_requests(&self.fixed, |r| (&r.tag, r.range.len(), PAGE_SIZE))?; + validate_pinned_ranges(&self.reserved, &self.fixed)?; + validate_requests(&self.mmio32, |r| (&r.tag, r.size, r.alignment))?; + validate_requests(&self.ram, |r| (&r.tag, r.size, r.alignment))?; + validate_requests(&self.mmio64, |r| (&r.tag, r.size, r.alignment))?; + validate_requests(&self.post_mmio, |r| (&r.tag, r.size, r.alignment))?; + + let mut state = AllocationState::new(); + state.place_reserved(&self.reserved); + state.place_fixed(&self.fixed)?; + state.place_mmio32(&mut self.mmio32)?; + state.place_ram(&mut self.ram)?; + state.place_mmio64(&mut self.mmio64)?; + state.place_post_mmio(&mut self.post_mmio)?; + + state.allocations.sort_by_key(|allocation| allocation.range); + // Trailing reserved ranges sit above every guest-visible allocation and + // exist only to keep that space out of the free list during placement. + // Returning them would bloat the layout without informing any + // consumer, so drop them. Reserved ranges interleaved with real + // allocations are still reported. 
+ while state + .allocations + .last() + .is_some_and(|allocation| allocation.kind == PlacedRangeKind::Reserved) + { + state.allocations.pop(); + } + Ok(state.allocations) + } +} + +impl Default for LayoutBuilder<'_> { + fn default() -> Self { + Self::new() + } +} + +fn validate_size_alignment(tag: &Arc, size: u64, alignment: u64) -> Result<(), AllocateError> { + if size == 0 || !size.is_multiple_of(PAGE_SIZE) { + return Err(AllocateError::InvalidSize { + tag: tag.clone(), + size, + }); + } + + if alignment < PAGE_SIZE || !alignment.is_power_of_two() { + return Err(AllocateError::InvalidAlignment { + tag: tag.clone(), + alignment, + }); + } + + Ok(()) +} + +fn validate_requests( + requests: &[T], + get: impl Fn(&T) -> (&Arc, u64, u64), +) -> Result<(), AllocateError> { + for request in requests { + let (tag, size, alignment) = get(request); + validate_size_alignment(tag, size, alignment)?; + } + + Ok(()) +} + +fn validate_pinned_ranges( + reserved_requests: &[ReservedRequest], + fixed_requests: &[FixedRequest], +) -> Result<(), AllocateError> { + let mut pinned = reserved_requests + .iter() + .map(|request| (request.range, &request.tag)) + .chain( + fixed_requests + .iter() + .map(|request| (request.range, &request.tag)), + ) + .collect::>(); + + pinned.sort_by_key(|(range, _)| range.start()); + + for &[(range_a, tag_a), (range_b, tag_b)] in pinned.array_windows() { + if range_a.overlaps(&range_b) { + return Err(AllocateError::FixedOverlap { + tag_a: tag_a.clone(), + range_a, + tag_b: tag_b.clone(), + range_b, + }); + } + } + + Ok(()) +} + +fn exhausted_error( + tag: &Arc, + size: u64, + alignment: u64, + phase: AllocationPhase, + free_ranges: &[MemoryRange], + region_start: u64, + region_end: u64, +) -> AllocateError { + AllocateError::Exhausted { + tag: tag.clone(), + size, + alignment, + phase, + free_space: free_space_in_region(free_ranges, region_start, region_end), + } +} + +fn free_space_in_region(free_ranges: &[MemoryRange], region_start: u64, region_end: 
u64) -> u64 { + free_ranges + .iter() + .filter_map(|range| clamp_to_region(*range, region_start, region_end)) + .map(|(start, end)| end - start) + .sum() +} + +/// Clamps a free range to the requested placement region. Returns `None` when +/// the intersection is empty. +fn clamp_to_region(range: MemoryRange, region_start: u64, region_end: u64) -> Option<(u64, u64)> { + let start = range.start().max(region_start); + let end = range.end().min(region_end); + (start < end).then_some((start, end)) +} + +fn find_highest_fit( + free_ranges: &[MemoryRange], + size: u64, + alignment: u64, + region_start: u64, + region_end: u64, +) -> Option { + for range in free_ranges.iter().rev() { + let Some((effective_start, effective_end)) = + clamp_to_region(*range, region_start, region_end) + else { + continue; + }; + if effective_end - effective_start < size { + continue; + } + let aligned_start = align_down(effective_end - size, alignment); + if aligned_start >= effective_start { + return Some(aligned_start); + } + } + + None +} + +fn find_lowest_fit( + free_ranges: &[MemoryRange], + size: u64, + alignment: u64, + region_start: u64, + region_end: u64, +) -> Option { + for range in free_ranges { + let Some((effective_start, effective_end)) = + clamp_to_region(*range, region_start, region_end) + else { + continue; + }; + let Some(aligned_start) = align_up(effective_start, alignment) else { + continue; + }; + let Some(end) = aligned_start.checked_add(size) else { + continue; + }; + if end <= effective_end { + return Some(aligned_start); + } + } + + None +} + +fn find_lowest_splittable_fit( + free_ranges: &[MemoryRange], + size: u64, + alignment: u64, + region_start: u64, + region_end: u64, +) -> Option> { + let mut remaining = size; + let mut ranges = Vec::new(); + + for range in free_ranges { + let Some((effective_start, effective_end)) = + clamp_to_region(*range, region_start, region_end) + else { + continue; + }; + let Some(aligned_start) = align_up(effective_start, alignment) 
else { + continue; + }; + if aligned_start >= effective_end { + continue; + } + + let available = effective_end - aligned_start; + let allocation_size = if available >= remaining { + remaining + } else { + align_down(available, alignment) + }; + if allocation_size == 0 { + continue; + } + ranges.push(MemoryRange::new( + aligned_start..aligned_start + allocation_size, + )); + remaining -= allocation_size; + + if remaining == 0 { + return Some(ranges); + } + } + + None +} + +fn align_down(value: u64, alignment: u64) -> u64 { + value & !(alignment - 1) +} + +fn align_up(value: u64, alignment: u64) -> Option { + value + .checked_add(alignment - 1) + .map(|value| align_down(value, alignment)) +} + +#[cfg(test)] +mod tests { + use super::*; + + const KIB: u64 = 1024; + const MIB: u64 = 1024 * KIB; + const GIB: u64 = 1024 * MIB; + + #[test] + fn empty_input() { + let sorted = LayoutBuilder::new().allocate().unwrap(); + assert!(sorted.is_empty()); + } + + #[test] + fn fixed_request_is_reported() { + let mut builder = LayoutBuilder::new(); + let range = MemoryRange::new(0xFC00_0000..0xFC40_0000); + builder.fixed("fixed", range); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(sorted[0].range, range); + assert_eq!(sorted[0].kind, PlacedRangeKind::Fixed); + } + + #[test] + fn fixed_overlap_rejected() { + let mut builder = LayoutBuilder::new(); + builder.fixed("first", MemoryRange::new(0x1000..0x3000)); + builder.fixed("second", MemoryRange::new(0x2000..0x3000)); + + let error = builder.allocate().unwrap_err(); + + assert!(matches!(error, AllocateError::FixedOverlap { .. })); + } + + #[test] + fn invalid_request_rejected() { + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request("zero", &mut target, 0, PAGE_SIZE, Placement::Mmio32); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidSize { .. 
} + )); + + let mut target = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request("alignment", &mut target, PAGE_SIZE, KIB, Placement::Mmio32); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::InvalidAlignment { .. } + )); + } + + #[test] + fn reserved_overlap_rejected() { + let mut builder = LayoutBuilder::new(); + builder.reserve("reserved", MemoryRange::new(GIB..GIB + MIB)); + builder.fixed( + "fixed", + MemoryRange::new(GIB + PAGE_SIZE..GIB + PAGE_SIZE + MIB), + ); + + let error = builder.allocate().unwrap_err(); + + assert!(matches!(error, AllocateError::FixedOverlap { .. })); + } + + #[test] + fn mmio32_uses_top_down_placement_below_4_gib() { + let mut first = MemoryRange::EMPTY; + let mut second = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.fixed("reserved", MemoryRange::new(0xFE00_0000..0x1_0000_0000)); + builder.request("first", &mut first, MIB, MIB, Placement::Mmio32); + builder.request("second", &mut second, MIB, MIB, Placement::Mmio32); + + builder.allocate().unwrap(); + + assert_eq!(first, MemoryRange::new(0xFDF0_0000..0xFE00_0000)); + assert_eq!(second, MemoryRange::new(0xFDE0_0000..0xFDF0_0000)); + } + + #[test] + fn mmio32_orders_by_alignment_then_size_then_request_order() { + let mut small = MemoryRange::EMPTY; + let mut aligned = MemoryRange::EMPTY; + let mut large = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request("small", &mut small, MIB, MIB, Placement::Mmio32); + builder.request("aligned", &mut aligned, MIB, 256 * MIB, Placement::Mmio32); + builder.request("large", &mut large, 4 * MIB, MIB, Placement::Mmio32); + + builder.allocate().unwrap(); + + assert_eq!(aligned.start() % (256 * MIB), 0); + assert_eq!(large.len(), 4 * MIB); + assert_eq!(small.len(), MIB); + assert!(!aligned.overlaps(&large)); + assert!(!aligned.overlaps(&small)); + assert!(!large.overlaps(&small)); + } + + #[test] + fn ram_starts_at_zero() { + let mut ram = 
Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(ram, [MemoryRange::new(0..2 * GIB)]); + assert_eq!(sorted[0].kind, PlacedRangeKind::Ram); + assert_eq!(sorted[0].range, ram[0]); + } + + #[test] + fn ram_splits_around_fixed_ranges_and_mmio32() { + let mut mmio32 = MemoryRange::EMPTY; + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.fixed("fixed", MemoryRange::new(GIB..GIB + MIB)); + builder.request("mmio32", &mut mmio32, 2 * GIB, MIB, Placement::Mmio32); + builder.ram("ram", &mut ram, 3 * GIB, PAGE_SIZE); + + builder.allocate().unwrap(); + + assert_eq!( + ram, + [ + MemoryRange::new(0..GIB), + MemoryRange::new(GIB + MIB..2 * GIB), + MemoryRange::new(FOUR_GIB..FOUR_GIB + GIB + MIB), + ] + ); + } + + #[test] + fn ram_split_chunks_round_down_to_alignment() { + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.fixed("fixed", MemoryRange::new(GIB + MIB..GIB + 2 * MIB)); + builder.ram("ram", &mut ram, 2 * GIB, GIB); + + builder.allocate().unwrap(); + + assert_eq!( + ram, + [MemoryRange::new(0..GIB), MemoryRange::new(2 * GIB..3 * GIB),] + ); + } + + #[test] + fn ram_requests_are_placed_in_order() { + // Two RAM requests must not interleave: the second request starts at + // or above the maximum end address of the first, so the flattened + // RAM list is always sorted by address. 
+ let mut first = Vec::new(); + let mut second = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.ram("first", &mut first, 2 * GIB, PAGE_SIZE); + builder.ram("second", &mut second, GIB, PAGE_SIZE); + + builder.allocate().unwrap(); + + assert_eq!(first, [MemoryRange::new(0..2 * GIB)]); + assert_eq!(second, [MemoryRange::new(2 * GIB..3 * GIB)]); + } + + #[test] + fn ram_request_does_not_backfill_earlier_fragments() { + // A small fixed range below the first RAM request's end leaves an + // unaligned fragment that the first request skips. An earlier + // best-fit policy would have allowed a smaller-aligned later RAM + // request to backfill that fragment, producing an out-of-order RAM + // list. In-order placement floors each request at the previous + // request's end, so the fragment stays unallocated and vnode order + // matches address order. + let mut first = Vec::new(); + let mut second = Vec::new(); + let mut builder = LayoutBuilder::new(); + // Carve a tiny hole inside what the first request would otherwise + // round down to a GiB-aligned chunk. + builder.fixed("hole", MemoryRange::new(GIB + MIB..GIB + 2 * MIB)); + builder.ram("first", &mut first, 2 * GIB, GIB); + builder.ram("second", &mut second, 256 * MIB, PAGE_SIZE); + builder.allocate().unwrap(); + + // First request lands at [0, 1 GiB) and [2 GiB, 3 GiB); the fragment + // at [1 GiB + 2 MiB, 2 GiB) is left free. + assert_eq!( + first, + [MemoryRange::new(0..GIB), MemoryRange::new(2 * GIB..3 * GIB)] + ); + // The 256 MiB second request would fit at 1 GiB + 2 MiB if backfill + // were allowed; instead it must come after the first request's max + // end (3 GiB). 
+ assert_eq!(second.len(), 1); + assert!( + second[0].start() >= first.iter().map(|r| r.end()).max().unwrap(), + "second RAM request backfilled below first request's end: {second:?}" + ); + assert_eq!(second, [MemoryRange::new(3 * GIB..3 * GIB + 256 * MIB)]); + } + + #[test] + fn ram_in_order_keeps_flattened_list_sorted_with_mmio32() { + // Reproduces the scenario that would have produced an unsorted RAM + // list under best-fit: a fixed Mmio32-style range low in memory plus + // a small second vnode that could otherwise be placed before the + // first vnode's tail. + let mut first = Vec::new(); + let mut second = Vec::new(); + let mut builder = LayoutBuilder::new(); + // A 1 MiB fixed range (e.g. a PCIe BAR) just above 1 GiB. + builder.fixed("pcie_bar", MemoryRange::new(0x4010_0000..0x4020_0000)); + builder.ram("first", &mut first, 2 * GIB, PAGE_SIZE); + builder.ram("second", &mut second, 512 * MIB, PAGE_SIZE); + + builder.allocate().unwrap(); + + let first_end = first.iter().map(|r| r.end()).max().unwrap(); + assert!( + second.iter().all(|r| r.start() >= first_end), + "second vnode placed below first vnode's end: first={first:?} second={second:?}" + ); + + let mut all: Vec<_> = first.iter().chain(second.iter()).copied().collect(); + let sorted = { + let mut s = all.clone(); + s.sort_by_key(|r| r.start()); + s + }; + assert_eq!(all, sorted, "flattened RAM list must be sorted"); + // Sanity: no overlaps either. 
+ all.sort_by_key(|r| r.start()); + for pair in all.windows(2) { + assert!( + pair[0].end() <= pair[1].start(), + "overlapping RAM ranges: {pair:?}" + ); + } + } + + #[test] + fn mmio64_uses_bottom_up_placement_above_four_gib() { + let mut ram = Vec::new(); + let mut first = MemoryRange::EMPTY; + let mut second = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.request("first", &mut first, MIB, MIB, Placement::Mmio64); + builder.request("second", &mut second, MIB, MIB, Placement::Mmio64); + + builder.allocate().unwrap(); + + // Mmio64 is floored at 4 GiB even when RAM ends below it. + assert_eq!(first, MemoryRange::new(FOUR_GIB..FOUR_GIB + MIB)); + assert_eq!(second, MemoryRange::new(FOUR_GIB + MIB..FOUR_GIB + 2 * MIB)); + } + + #[test] + fn mmio64_starts_above_ram_when_ram_exceeds_four_gib() { + let mut ram = Vec::new(); + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 6 * GIB, PAGE_SIZE); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); + + builder.allocate().unwrap(); + + // RAM occupies [0, 4 GiB) and [4 GiB + low MMIO ..]; with no Mmio32 + // requests, the second RAM extent starts at 4 GiB and ends at 6 GiB + + // (low MMIO hole) above 4 GiB. Mmio64 is placed bottom-up above RAM. 
+ let ram_end = ram.iter().map(|r| r.end()).max().unwrap(); + assert_eq!(mmio64, MemoryRange::new(ram_end..ram_end + MIB)); + assert!(mmio64.start() >= FOUR_GIB); + } + + #[test] + fn mmio64_skips_fixed_ranges_above_four_gib() { + let mut ram = Vec::new(); + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.fixed("fixed", MemoryRange::new(FOUR_GIB..FOUR_GIB + MIB)); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); + + builder.allocate().unwrap(); + + assert_eq!(mmio64, MemoryRange::new(FOUR_GIB + MIB..FOUR_GIB + 2 * MIB)); + } + + #[test] + fn post_mmio_uses_bottom_up_placement_after_all_mmio() { + let mut ram = Vec::new(); + let mut mmio64 = MemoryRange::EMPTY; + let mut post_mmio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); + builder.request("post_mmio", &mut post_mmio, MIB, MIB, Placement::PostMmio); + + builder.allocate().unwrap(); + + assert_eq!(mmio64, MemoryRange::new(FOUR_GIB..FOUR_GIB + MIB)); + assert_eq!( + post_mmio, + MemoryRange::new(FOUR_GIB + MIB..FOUR_GIB + 2 * MIB) + ); + } + + #[test] + fn post_mmio_preserves_request_order() { + let mut ram = Vec::new(); + let mut first = MemoryRange::EMPTY; + let mut aligned = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.request("first", &mut first, MIB, MIB, Placement::PostMmio); + builder.request("aligned", &mut aligned, MIB, GIB, Placement::PostMmio); + + builder.allocate().unwrap(); + + assert_eq!(first, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert_eq!(aligned, MemoryRange::new(3 * GIB..3 * GIB + MIB)); + } + + #[test] + fn high_reserved_range_does_not_affect_post_mmio_placement() { + let mut ram = Vec::new(); + let mut post_mmio = MemoryRange::EMPTY; + let mut builder = 
LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.reserve( + "high_reserved", + MemoryRange::new(0xFD_0000_0000..0xFD_4000_0000), + ); + builder.request("post_mmio", &mut post_mmio, MIB, MIB, Placement::PostMmio); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(post_mmio, MemoryRange::new(2 * GIB..2 * GIB + MIB)); + assert!( + !sorted + .iter() + .any(|allocation| allocation.kind == PlacedRangeKind::Reserved) + ); + } + + #[test] + fn reserved_range_between_allocations_is_reported() { + let mut ram = Vec::new(); + let mut post_mmio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.reserve("reserved", MemoryRange::new(2 * GIB..2 * GIB + MIB)); + builder.request("post_mmio", &mut post_mmio, MIB, MIB, Placement::PostMmio); + + let sorted = builder.allocate().unwrap(); + + assert_eq!( + post_mmio, + MemoryRange::new(2 * GIB + MIB..2 * GIB + 2 * MIB) + ); + assert!(sorted.iter().any(|allocation| { + allocation.kind == PlacedRangeKind::Reserved + && allocation.range == MemoryRange::new(2 * GIB..2 * GIB + MIB) + })); + } + + #[test] + fn fixed_hypertransport_hole_is_regular_fixed_placement() { + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + let hypertransport = MemoryRange::new(0xFD_0000_0000..0xFD_4000_0000); + builder.fixed("amd_hypertransport_hole", hypertransport); + + let sorted = builder.allocate().unwrap(); + + assert_eq!(sorted.last().unwrap().range, hypertransport); + } + + #[test] + fn exhaustion_reports_phase() { + let mut mmio32 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.request( + "too_big", + &mut mmio32, + 4 * GIB + PAGE_SIZE, + PAGE_SIZE, + Placement::Mmio32, + ); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::Exhausted { + phase: AllocationPhase::Mmio32, + .. 
+ } + )); + + let mut ram = Vec::new(); + let mut builder = LayoutBuilder::new(); + builder.fixed("fixed", MemoryRange::new(0..ADDRESS_LIMIT)); + builder.ram("ram", &mut ram, PAGE_SIZE, PAGE_SIZE); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::Exhausted { + phase: AllocationPhase::Ram, + .. + } + )); + + let mut ram = Vec::new(); + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, PAGE_SIZE, PAGE_SIZE); + builder.fixed("fixed", MemoryRange::new(PAGE_SIZE..ADDRESS_LIMIT)); + builder.request( + "mmio64", + &mut mmio64, + PAGE_SIZE, + PAGE_SIZE, + Placement::Mmio64, + ); + assert!(matches!( + builder.allocate().unwrap_err(), + AllocateError::Exhausted { + phase: AllocationPhase::Mmio64, + .. + } + )); + } + + #[test] + fn sorted_result_preserves_tags_and_kinds() { + let mut ram = Vec::new(); + let mut mmio32 = MemoryRange::EMPTY; + let mut mmio64 = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, GIB, PAGE_SIZE); + builder.request("mmio32", &mut mmio32, MIB, MIB, Placement::Mmio32); + builder.request("mmio64", &mut mmio64, MIB, MIB, Placement::Mmio64); + + let sorted = builder.allocate().unwrap(); + + // mmio32 sits just below 4 GiB; mmio64 sits at 4 GiB or above. 
+ assert_eq!(&*sorted[0].tag, "ram"); + assert_eq!(sorted[0].kind, PlacedRangeKind::Ram); + assert_eq!(&*sorted[1].tag, "mmio32"); + assert_eq!(sorted[1].kind, PlacedRangeKind::Mmio32); + assert_eq!(&*sorted[2].tag, "mmio64"); + assert_eq!(sorted[2].kind, PlacedRangeKind::Mmio64); + } + + #[test] + fn deterministic() { + let mut previous = None; + + for _ in 0..10 { + let mut ram = Vec::new(); + let mut vmbus_low = MemoryRange::EMPTY; + let mut pcie_ecam = MemoryRange::EMPTY; + let mut pcie_high = MemoryRange::EMPTY; + let mut virtio = MemoryRange::EMPTY; + let mut builder = LayoutBuilder::new(); + builder.ram("ram", &mut ram, 2 * GIB, PAGE_SIZE); + builder.fixed("reserved", MemoryRange::new(0xFE00_0000..0x1_0000_0000)); + builder.request( + "vmbus_low", + &mut vmbus_low, + 128 * MIB, + MIB, + Placement::Mmio32, + ); + builder.request( + "pcie_ecam", + &mut pcie_ecam, + 256 * MIB, + 256 * MIB, + Placement::Mmio32, + ); + builder.request("pcie_high", &mut pcie_high, GIB, MIB, Placement::Mmio64); + builder.request( + "virtio", + &mut virtio, + PAGE_SIZE, + PAGE_SIZE, + Placement::Mmio32, + ); + + let sorted = builder.allocate().unwrap(); + if let Some(previous) = &previous { + assert_eq!(previous, &sorted); + } + previous = Some(sorted); + } + } +} diff --git a/vm/vmcore/vm_topology/src/lib.rs b/vm/vmcore/vm_topology/src/lib.rs index c9649cca18..4d9a163375 100644 --- a/vm/vmcore/vm_topology/src/lib.rs +++ b/vm/vmcore/vm_topology/src/lib.rs @@ -6,6 +6,7 @@ #![forbid(unsafe_code)] +pub mod layout; pub mod memory; pub mod pcie; pub mod processor; diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index 621884a410..bf0fe05ea6 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -269,6 +269,37 @@ impl MemoryLayout { Self::build(memory.to_vec(), gaps.to_vec(), vec![], vec![], None) } + /// Makes a new memory layout from already-resolved RAM and fixed ranges. 
+ /// + /// Each individual range must be non-empty, but the lists themselves may + /// be empty (e.g. no PCIe root complexes means empty PCI ECAM/MMIO + /// vectors). Ranges within each list must be sorted and non-overlapping. + /// MMIO gaps may contain empty placeholder ranges to preserve positional + /// indexing (e.g. `mmio()[0]` = low, `mmio()[1]` = high); empty entries + /// are ignored during validation. The combined layout is also validated + /// for overlaps, including the optional VTL2 range. + pub fn new_from_resolved_ranges( + ram: Vec, + mmio_gaps: Vec, + pci_ecam_gaps: Vec, + pci_mmio_gaps: Vec, + vtl2_range: Option, + ) -> Result { + validate_ranges_with_metadata(&ram)?; + // MMIO gaps may include empty placeholders for positional indexing; + // validate only the non-empty entries. + let non_empty_mmio: Vec<_> = mmio_gaps + .iter() + .copied() + .filter(|r| !r.is_empty()) + .collect(); + validate_ranges(&non_empty_mmio)?; + validate_ranges(&pci_ecam_gaps)?; + validate_ranges(&pci_mmio_gaps)?; + + Self::build(ram, mmio_gaps, pci_ecam_gaps, pci_mmio_gaps, vtl2_range) + } + /// Builds the memory layout. /// /// `ram` must already be known to be sorted. @@ -279,6 +310,9 @@ impl MemoryLayout { pci_mmio: Vec, vtl2_range: Option, ) -> Result { + // Filter out empty placeholder ranges before validation and overlap + // checks — they carry no physical meaning and exist only for + // positional indexing in the stored mmio vector. let mut all_ranges = ram .iter() .map(|x| &x.range) @@ -287,6 +321,7 @@ impl MemoryLayout { .chain(&pci_ecam) .chain(&pci_mmio) .copied() + .filter(|r| !r.is_empty()) .collect::>(); all_ranges.sort(); @@ -398,7 +433,12 @@ impl MemoryLayout { /// One past the last byte of RAM, MMIO, PCI ECAM, or PCI MMIO. 
pub fn end_of_layout(&self) -> u64 { [ - self.mmio.last().expect("mmio set").end(), + self.mmio + .iter() + .filter(|r| !r.is_empty()) + .map(|r| r.end()) + .max() + .unwrap_or(0), self.end_of_ram(), self.pci_ecam.last().map(|r| r.end()).unwrap_or(0), self.pci_mmio.last().map(|r| r.end()).unwrap_or(0), @@ -555,6 +595,69 @@ mod tests { MemoryLayout::new(TB, &[], pci_ecam, pci_mmio, None).unwrap_err(); } + #[test] + fn resolved_ranges_constructor() { + let ram = vec![ + MemoryRangeWithNode { + range: MemoryRange::new(0..GB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(2 * GB..3 * GB), + vnode: 1, + }, + ]; + let mmio = vec![MemoryRange::new(GB..2 * GB)]; + let pci_ecam = vec![MemoryRange::new(4 * GB..4 * GB + MB)]; + let pci_mmio = vec![MemoryRange::new(5 * GB..6 * GB)]; + + let layout = MemoryLayout::new_from_resolved_ranges( + ram.clone(), + mmio.clone(), + pci_ecam.clone(), + pci_mmio.clone(), + None, + ) + .unwrap(); + + assert_eq!(layout.ram(), ram); + assert_eq!(layout.mmio(), mmio); + assert_eq!(layout.probe_address(4 * GB), Some(AddressType::PciEcam)); + assert_eq!(layout.probe_address(5 * GB), Some(AddressType::PciMmio)); + } + + #[test] + fn resolved_ranges_reject_overlap_with_fixed_ranges() { + let ram = vec![MemoryRangeWithNode { + range: MemoryRange::new(0..2 * GB), + vnode: 0, + }]; + let mmio = vec![MemoryRange::new(GB..2 * GB)]; + + assert!(MemoryLayout::new_from_resolved_ranges(ram, mmio, vec![], vec![], None).is_err()); + } + + #[test] + fn resolved_ranges_validate_vtl2_against_ram_end() { + let ram = vec![ + MemoryRangeWithNode { + range: MemoryRange::new(0..GB), + vnode: 0, + }, + MemoryRangeWithNode { + range: MemoryRange::new(3 * GB..4 * GB), + vnode: 0, + }, + ]; + let mmio = vec![MemoryRange::new(GB..2 * GB)]; + let vtl2_range = MemoryRange::new(2 * GB..2 * GB + MB); + + assert!(matches!( + MemoryLayout::new_from_resolved_ranges(ram, mmio, vec![], vec![], Some(vtl2_range)), + Err(Error::Vtl2RangeBeforeEndOfRam) + )); + } 
+ #[test] fn pci_ranges() { let mmio = &[MemoryRange::new(3 * GB..4 * GB)]; diff --git a/vmm_core/src/acpi_builder.rs b/vmm_core/src/acpi_builder.rs index 791edcb857..49503f2f4e 100644 --- a/vmm_core/src/acpi_builder.rs +++ b/vmm_core/src/acpi_builder.rs @@ -329,10 +329,18 @@ impl AcpiTablesBuilder<'_, T> { // address reported in the MCFG table must reflect wherever bus number // 0 would be accessible even if the host bridge has a different starting // bus number. + // + // The layout resolver guarantees `ecam_range.start() >= + // start_bus * 1 MiB` so this subtraction never underflows in + // practice. Use `wrapping_sub` anyway so that, if a future code + // path ever bypasses that check, behavior matches what a C MCFG + // builder would do: the guest sees a wrapped base address and is + // most likely to still compute the right per-bus ECAM addresses + // for the buses it actually accesses. let ecam_region_offset = (bridge.start_bus as u64) * 256 * 4096; mcfg_extra.extend_from_slice( acpi_spec::mcfg::McfgSegmentBusRange::new( - bridge.ecam_range.start() - ecam_region_offset, + bridge.ecam_range.start().wrapping_sub(ecam_region_offset), bridge.segment, bridge.start_bus, bridge.end_bus, diff --git a/vmm_core/vm_manifest_builder/Cargo.toml b/vmm_core/vm_manifest_builder/Cargo.toml index b89fa5d683..a040d3a9b0 100644 --- a/vmm_core/vm_manifest_builder/Cargo.toml +++ b/vmm_core/vm_manifest_builder/Cargo.toml @@ -15,6 +15,7 @@ serial_core.workspace = true serial_debugcon_resources.workspace = true serial_pl011_resources.workspace = true vm_resource.workspace = true +vmm_core_defs.workspace = true vmotherboard.workspace = true mesh.workspace = true diff --git a/vmm_core/vm_manifest_builder/src/lib.rs b/vmm_core/vm_manifest_builder/src/lib.rs index b49f96066c..f63c624c63 100644 --- a/vmm_core/vm_manifest_builder/src/lib.rs +++ b/vmm_core/vm_manifest_builder/src/lib.rs @@ -43,6 +43,7 @@ use vm_resource::PlatformResource; use vm_resource::Resource; use 
vm_resource::ResourceId; use vm_resource::kind::SerialBackendHandle; +pub use vmm_core_defs::LayoutConfig; use vmotherboard::ChipsetDeviceHandle; use vmotherboard::LegacyPciChipsetDeviceHandle; use vmotherboard::options::BaseChipsetManifest; @@ -392,6 +393,36 @@ impl VmManifestBuilder { Ok(result) } + + /// Returns the default memory layout sizing for this VM type and + /// architecture. + /// + /// This is separate from [`Self::build`] because not every consumer runs + /// the layout engine. In particular, OpenHCL (Underhill) receives its + /// memory layout from the host and does not use these defaults. + pub fn layout_config(&self) -> LayoutConfig { + let default_low = match self.arch { + MachineArch::X86_64 => 128 * 1024 * 1024, + MachineArch::Aarch64 => 512 * 1024 * 1024, + }; + let default_high: u64 = 512 * 1024 * 1024; + let default_vtl2: u64 = 1024 * 1024 * 1024; + match self.ty { + BaseChipsetType::HypervGen1 + | BaseChipsetType::HypervGen2Uefi + | BaseChipsetType::HyperVGen2LinuxDirect + | BaseChipsetType::UnenlightenedLinuxDirect => LayoutConfig { + chipset_low_mmio_size: default_low, + chipset_high_mmio_size: default_high, + vtl2_chipset_mmio_size: 0, + }, + BaseChipsetType::HclHost => LayoutConfig { + chipset_low_mmio_size: default_low, + chipset_high_mmio_size: default_high, + vtl2_chipset_mmio_size: default_vtl2, + }, + } + } } impl VmChipsetResult { diff --git a/vmm_core/vmm_core_defs/src/lib.rs b/vmm_core/vmm_core_defs/src/lib.rs index 221cbe554c..37211e4d76 100644 --- a/vmm_core/vmm_core_defs/src/lib.rs +++ b/vmm_core/vmm_core_defs/src/lib.rs @@ -9,7 +9,27 @@ pub mod debug_rpc; use inspect::Inspect; +use mesh::MeshPayload; use mesh::payload::Protobuf; + +/// Default memory layout sizing for a VM, used by the layout engine in +/// `openvmm_core::worker::memory_layout`. +/// +/// Consumers that receive their memory layout from the host (such as OpenHCL / +/// Underhill) do not use these values. 
+#[derive(Debug, Clone, MeshPayload)] +pub struct LayoutConfig { + /// Chipset low MMIO range size (below 4 GiB) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `0` uses only the + /// architectural minimum (LAPIC, IOAPIC, GIC, etc.). + pub chipset_low_mmio_size: u32, + /// Chipset high MMIO range size (above RAM) for VMOD/PCI0 _CRS. + /// The address is always allocated dynamically. `0` disables the range. + pub chipset_high_mmio_size: u64, + /// VTL2-private chipset MMIO range size for VTL2 VMBus. + /// The address is always allocated dynamically. `0` disables the range. + pub vtl2_chipset_mmio_size: u64, +} use std::sync::Arc; /// HaltReason sent by devices and vp_set to the vmm. diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs index 9e4f4127d1..5db40cded7 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/memstat.rs @@ -404,7 +404,6 @@ async fn idle_test( MemoryConfig { startup_bytes: 16 * (1024 * 1024 * 1024), dynamic_memory_range: None, - mmio_gaps: petri::MmioConfig::Platform, numa_mem_sizes: None, } }) diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs index 921906e17e..c5593ee428 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs +++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs @@ -363,35 +363,74 @@ async fn parse_openhcl_memory_node( Ok(MemoryRange::new(range_start..range_end)) } -/// Test VTL2 memory allocation mode, and validate that VTL0 saw the correct -/// amount of mmio, when the host provides a VTL2 mmio range. +/// Enumerate the VTL0 chipset MMIO ranges reported by the bootloader in the +/// `openhcl` device tree node, sorted by start address. +/// +/// The `openhcl/memory@*` nodes are a mix of VTL0/VTL2 RAM and VTL0/VTL2 MMIO. 
+/// This helper lists the directory, filters to entries whose +/// `openhcl,memory-type` is `VTL0_MMIO` (5), and delegates to +/// `parse_openhcl_memory_node` for the range read. +async fn enumerate_openhcl_vtl0_mmio_ranges( + agent: &PipetteClient, +) -> Result, anyhow::Error> { + let sh = agent.unix_shell(); + let listing = cmd!(sh, "ls /sys/firmware/devicetree/base/openhcl/") + .read() + .await?; + let mut ranges = Vec::new(); + for name in listing.lines() { + let Some(start_hex) = name.strip_prefix("memory@") else { + continue; + }; + let start = u64::from_str_radix(start_hex, 16) + .map_err(|e| anyhow::anyhow!("failed to parse {name}: {e}"))?; + // Read the type first so we can skip non-VTL0_MMIO entries (RAM, + // VTL2_MMIO) without tripping the assertion in + // `parse_openhcl_memory_node`. + let memory_type: u32 = read_sysfs_dt::( + agent, + format!("openhcl/{name}/openhcl,memory-type").as_str(), + ) + .await? + .into(); + const VTL0_MMIO: u32 = 5; + if memory_type != VTL0_MMIO { + continue; + } + ranges.push(parse_openhcl_memory_node(agent, start).await?); + } + ranges.sort_by_key(|r| r.start()); + Ok(ranges) +} + +/// Test VTL2 memory allocation mode and validate that the bootloader-built +/// device tree reflects the host-provided VTL2 MMIO range (path A in +/// `openhcl_boot`'s MMIO selection). /// -/// TODO: onboard Hyper-V support in petri for custom mmio config once Hyper-V -/// supports this. +/// Path B — where `openhcl_boot` carves VTL2 MMIO out of VTL0 because the host +/// did not provide a range — is covered by unit tests for +/// `select_vtl2_mmio_range` in `openhcl_boot::host_params::mmio`. #[openvmm_test(openhcl_linux_direct_x64)] async fn openhcl_linux_vtl2_mmio_self_allocate( config: PetriVmBuilder, ) -> Result<(), anyhow::Error> { - // Use the OpenVMM default which has a 1GB mmio gap for VTL2. This should - // cause the whole gap to be given to VTL2, as we should report 128MB for - // self allocation. 
- let expected_mmio_ranges: Vec = - openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(); + // Default chipset MMIO sizes for `HclHost` from + // `vm_manifest_builder::layout_config`. Keep in sync with that file. + const DEFAULT_LOW_MMIO_SIZE: u64 = 128 * 1024 * 1024; + const DEFAULT_HIGH_MMIO_SIZE: u64 = 512 * 1024 * 1024; + const DEFAULT_VTL2_MMIO_SIZE: u64 = 1024 * 1024 * 1024; + // `mmio-size` is hardcoded in openvmm — see + // `openvmm_core::worker::vm_loaders::igvm::build_device_tree`. + const EXPECTED_MMIO_SIZE: u64 = 128 * 1024 * 1024; + let (mut vm, agent) = config - .with_memory(MemoryConfig { - mmio_gaps: petri::MmioConfig::Custom(expected_mmio_ranges.clone()), - ..Default::default() - }) .with_vtl2_base_address_type(Vtl2BaseAddressType::Vtl2Allocate { size: None }) .run() .await?; let vtl2_agent = vm.wait_for_vtl2_agent().await?; - // Read the bootloader provided fdt via sysfs to verify that the VTL2 and - // VTL0 mmio ranges are as expected, along with the allocated mmio size - // being 128 MB. - let memory_allocation_mode: String = + let memory_allocation_mode = read_sysfs_dt_string(&vtl2_agent, "openhcl/memory-allocation-mode").await?; assert_eq!(memory_allocation_mode, "vtl2"); @@ -399,21 +438,52 @@ async fn openhcl_linux_vtl2_mmio_self_allocate( read_sysfs_dt::(&vtl2_agent, "openhcl/mmio-size") .await? .into(); - // NOTE: This value is hardcoded in openvmm today to report this to the - // guest provided device tree. - const EXPECTED_MMIO_SIZE: u64 = 128 * 1024 * 1024; assert_eq!(mmio_size, EXPECTED_MMIO_SIZE); - // Read the bootloader provided dt via sysfs to verify the VTL0 and VTL2 - // mmio ranges are as expected. + // VTL2 VMBus sees exactly one MMIO range — the VTL2-private chipset MMIO + // — placed in PostMmio above all VTL0-visible RAM/MMIO. 
let vtl2_mmio = parse_vmbus_mmio(&vtl2_agent, "bus/vmbus").await?; - assert_eq!(vtl2_mmio, expected_mmio_ranges[2..]); - let mut vtl0_mmio = Vec::new(); - for range_start in expected_mmio_ranges[..2].iter().map(|r| r.start()) { - let range = parse_openhcl_memory_node(&vtl2_agent, range_start).await?; - vtl0_mmio.push(range); - } - assert_eq!(vtl0_mmio, expected_mmio_ranges[..2]); + assert_eq!( + vtl2_mmio.len(), + 1, + "VTL2 should have exactly one MMIO range, got {:?}", + vtl2_mmio, + ); + assert_eq!(vtl2_mmio[0].len(), DEFAULT_VTL2_MMIO_SIZE); + assert!( + vtl2_mmio[0].start() >= 1 << 32, + "VTL2 MMIO should be above 4 GiB, got {:#x}", + vtl2_mmio[0].start(), + ); + + // VTL0 sees exactly two chipset MMIO ranges in the openhcl device tree: + // the low (Mmio32) range below 4 GiB and the high (Mmio64) range above + // RAM but below the VTL2 PostMmio range. + let vtl0_mmio = enumerate_openhcl_vtl0_mmio_ranges(&vtl2_agent).await?; + assert_eq!( + vtl0_mmio.len(), + 2, + "VTL0 should have exactly two chipset MMIO ranges, got {:?}", + vtl0_mmio, + ); + assert_eq!(vtl0_mmio[0].len(), DEFAULT_LOW_MMIO_SIZE); + assert!( + vtl0_mmio[0].end() <= 1 << 32, + "VTL0 low MMIO should be below 4 GiB, got {:?}", + vtl0_mmio[0], + ); + assert_eq!(vtl0_mmio[1].len(), DEFAULT_HIGH_MMIO_SIZE); + assert!( + vtl0_mmio[1].start() >= 1 << 32, + "VTL0 high MMIO should be above 4 GiB, got {:?}", + vtl0_mmio[1], + ); + assert!( + vtl0_mmio[1].end() <= vtl2_mmio[0].start(), + "VTL0 high MMIO {:?} should sit below the VTL2 chipset MMIO {:?}", + vtl0_mmio[1], + vtl2_mmio[0], + ); agent.power_off().await?; vm.wait_for_clean_teardown().await?;